Hi,

I have an application which, according to Intel Advisor, should achieve a speedup of about 3.5x using 4 threads.

Using OpenMP I was able to get only ~ 2.35 with 2 and 2.85 with 4 threads.

Now I am learning and applying Cilk to see if I can improve this performance. My compiler is icc (ICC) 14.0.0 20130728.

I am using only **cilk_for** (similar to what I did with OpenMP). Running the application on one processor, I get the same performance with Cilk and OpenMP. However, adding more processors hurts the Cilk performance significantly.

Below is the essential part of my code. The parallel loops are inside the functions preM() and posM(), which I have also listed below.

Notice that both preM() and posM() are called from inside a doubly nested loop, which is itself inside a while loop. Could that be the problem? Is there too much overhead? I have been playing with #pragma cilk grainsize without luck.

Any help and/or comments will be appreciated.

Thanks in advance.

......

// Outer sweep loop: repeats while myBool is set, for at most 14 passes
// (iter is incremented once per pass).
while (myBool && iter < 14) {

// Cleared at the start of every pass; presumably set again in the elided
// part near the end of the loop body -- TODO confirm.
myBool=false;

iter++;

// Visit every pair (i, j) with i < ncol and i < j <= nrow-1, j descending.
for (i=0; i<ncol; i++) {

for (j=nrow-1; j>i; j--) {

// Obtain c1/s1 from symmetric(). When j lies past the last column,
// 0.0 is passed in place of a[i][j] and a[j][j].
if (j >= ncol ) {

symmetric(a[i][i], 0.0, a[j][i], 0.0, &c1,&s1 );

} else {

symmetric(a[i][i], a[i][j], a[j][i], a[j][j],&c1,&s1 );

} // end if //

// Defaults for c/s; they stay in effect for the final preM() call on u
// when the j<ncol branch below is skipped.
c = 1.0;

s = 0.0;

// Combine rows i and j of a (ncol entries each) using c1 and -s1.
preM(a[i], a[j],ncol,c1,-s1);

if (j<ncol) {

// Obtain c/s from diagonal(), then apply them to rows i and j of a ...
diagonal(a[i][i],a[i][j], a[j][j], &c,&s);

preM(a[i], a[j],ncol,c,s);

// ... and to the elements reached from &a[0][i] / &a[0][j] (resp. v) in
// steps of ncol, i.e. columns i and j if rows are stored contiguously
// with row length ncol -- TODO confirm the storage layout.
posM(&a[0][i], &a[0][j],nrow,ncol,c,s);

posM(&v[0][i], &v[0][j],ncol,ncol,c,s);

} // end if //

// Combine rows i and j of u (nrow entries each) with coefficients built
// from both (c1, s1) and (c, s).
// NOTE(review): each preM()/posM() call runs its own cilk_for, so up to
// five short parallel loops are started per (i, j) pair; the per-call
// scheduling cost may outweigh the work -- worth measuring.
preM(u[i], u[j],nrow,(c1*c + s1*s),(c1*s - s1*c));

} // end for //

} // end for //

.....

} // end while //

.....

++++++++++++++++++++ preM.c +++++++++++++++

#include <stdio.h>

#include <stdlib.h>

#include <cilk/cilk.h>

#include "real.h"

/*
 * preM: combine two vectors of nCol elements element-wise, in place.
 *
 * For each index k in [0, nCol), with x = a[k] and y = b[k] read before
 * either is overwritten:
 *     a[k] = c*x - s*y
 *     b[k] = s*x + c*y
 *
 * Each iteration touches only its own a[k]/b[k]; the iterations are
 * handed to cilk_for.
 */
void preM (real *a,real *b, unsigned int nCol, real c, real s) {
  unsigned int k;

  cilk_for (k = 0; k < nCol; k++) {
    real *const pa = &a[k];
    real *const pb = &b[k];

    /* Read both inputs first: each output needs both old values. */
    const real x = *pa;
    const real y = *pb;

    *pa = c*x - s*y;
    *pb = s*x + c*y;
  }
}

++++++++++++++++++++ posM.c +++++++++++++++

#include <stdio.h>

#include <stdlib.h>

#include <cilk/cilk.h>

#include "real.h"

/*
 * posM: combine two strided sequences element-wise, in place.
 *
 * Starting at a and b, visits nRow elements spaced nCol apart. For each
 * visited offset, with ci = a[off] and cj = b[off] read before either is
 * overwritten:
 *     a[off] = c*ci - s*cj
 *     b[off] = s*ci + c*cj
 *
 * The loop runs over the row index and computes the offset in size_t.
 * The previous form used the bound nRow*nCol evaluated in unsigned int,
 * which wraps around for large matrices and silently shortens the loop.
 * Iterating over the row index also gives cilk_for a unit-stride control
 * variable.
 */
void posM (real *a,real *b, unsigned int nRow, unsigned int nCol, real c, real s) {
  unsigned int r;

  /* With nCol == 0 the old bound nRow*nCol was 0, so nothing was touched;
     keep that behaviour explicitly (every offset would otherwise be 0). */
  if (nCol == 0) {
    return;
  }

  cilk_for (r = 0; r < nRow; r++) {
    /* Widen before multiplying so the offset cannot wrap in unsigned int. */
    const size_t off = (size_t)r * nCol;

    const real ci = a[off];
    const real cj = b[off];

    a[off] = c*ci - s*cj;
    b[off] = s*ci + c*cj;
  }
}