Cilk Plus does quite well speeding up this slightly modified version of quick-sort for integers.
#include
#include
#include
#include
#ifdef __cilk
#include
#else
#define cilk_spawn
#define cilk_sync
#endif
#define THRESH 16
void inssort(int *a, int l, int r){
int i,j,v,t;
for(i=r; i>l; i--)if(a[i-1] > a[i]){t=a[i-1]; a[i-1]=a[i]; a[i]=t;}
for(i=l+2; i<=r; i++){
j=i; v=a[i];
while(v < a[j-1]){
a[j]=a[j-1]; j--;
}
a[j]=v;
}
}
void qsort3(int *A,int p,int r){
/* Input A(p:r), rnk (p <= rnk <= r). Returns the value of the element of rank rnk.
The array is partitioned about A[rnk], as well */
int k,t,q,rnd,f,x,j,rnk,p0,r0; // median is all that we want
lq: p0=p, r0=r; rnk=(p+r)/2;
if (r <= p) {
cilk_sync;
return;
}
if(r+1-p <= THRESH){
inssort(A,p,r);
cilk_sync;
return;
}
while(1){
if(p==r){ q=p; break;}
rnd=rand();
k=p+(r-p)*(double)rnd/(double)RAND_MAX+0.5;
t=A[k]; A[k]=A[r]; A[r]=t; /* randomize */
q=p-1; x=A[r]; //Inline version of q=rpart(A,p,r);
for(j=p; j r0-r){
cilk_spawn qsort3(A,p0,p);
p=r; r=r0; goto lq; // Avoid tail recursion qsort3(A,r,r0);
}
else {
cilk_spawn qsort3(A,r,r0);
r=p; p=p0; goto lq; // Avoid tail recursion qsort3(A,p0,p);
}
cilk_sync; // superfluous, but keep for safety
}
int main(int argc, char* argv[]) {
int size,*a,i,med; int c1,c2; int nl,nr;
printf("Threshhold = %dn",THRESH);
for(size=0x4000; size <= 0x1000000; size+=size){
a = malloc(sizeof(int)*size);
if(!a){
fprintf(stderr,"Could not allocate space for %08X integersn",size);
exit(1);
}
for(i=0; iHowever, CilkView gives a rather pessimistic estimate of 2.2 for the upper bound on parallel speed-up. On Windows 7 Home Premium X64 running on a Dell XPS 17 laptop with i7-2720QM, I get speed-ups that are never less than 3.0.
[bash]s:sortsav>icl /O3 /DNOCHECK /MD ssort.c
Intel C++ Compiler XE for applications running on IA-32, Version 12.0.4.196 Build 20110427
s:sortsav>cilkview -trials one ssort.exe cilkview: running with 8 workers Threshhold = 16 M 00016384 0.001 M 00032768 0.001 M 00065536 0.002 M 00131072 0.003 M 00262144 0.006 M 00524288 0.012 M 01048576 0.023 M 02097152 0.038 M 04194304 0.078 M 08388608 0.153 M 16777216 0.293 cilkview: generating scalability data Cilkview Scalability Analyzer V2.0.0, Build 1113 Threshhold = 16 M 00016384 0.120 M 00032768 0.052 M 00065536 0.060 M 00131072 0.079 M 00262144 0.111 M 00524288 0.178 M 01048576 0.279 M 02097152 0.448 M 04194304 0.834 M 08388608 1.528 M 16777216 2.923 Whole Program Statistics 1) Parallelism Profile Work : 10,511,027,859 instructions Span : 4,781,368,295 instructions Burdened span : 4,790,901,465 instructions Parallelism : 2.20 Burdened parallelism : 2.19 Number of spawns/syncs: 433,563 Average instructions / strand : 8,081 Strands along span : 767 Average instructions / strand on span : 6,233,856 Total number of atomic instructions : 44 Frame count : 867,137 2) Speedup Estimate 2 processors: 1.13 - 2.00 4 processors: 1.20 - 2.20 8 processors: 1.25 - 2.20 16 processors: 1.27 - 2.20 32 processors: 1.28 - 2.20



