1. I can't use cilk_spawn to call a function that has a return value. Thus, I used cilk_for instead; see the example code below - function Cilk_For_Pi().

2. I tried cilk_spawn, getting the return value through one of the function's parameters; see the example code below - function Cilk_Spawn_Pi().

Both methods give the correct result, but the performance of case 2) is worse than the serial code. (I work on a T61, WinXP, Composer 2011 beta build 0.16.)

Regards, Peter

Example code - (you can remove the VTune and Advisor code if you can't build it)

#include

#include

#include

#include

#include

#include

#include "tbb/task_scheduler_init.h"

#include "tbb/blocked_range.h"

#include "tbb/parallel_for.h"

#include "tbb/spin_mutex.h"

#include "tbb/mutex.h"

#include "tbb/tick_count.h"

#include

#include "Vtuneapi.h"

#include "annotate.h"

using namespace tbb;

using namespace std;

// Number of integration intervals: Serial_Pi() loops up to this bound and
// main() derives `step` from it.
const int num_steps = 200000000;

// Number of worker slices; sizes the thread-handle and partial-sum arrays.
const int num_threads = 8; // My laptop is T61, cores - 2

// step: interval width, set to 1.0/num_steps in main() before each run.
// pi:   shared result; main() resets it to 0.0 before each run and every
//       implementation stores or accumulates its answer here.
double step = 0.0, pi = 0.0;

// Serializes the `pi += ...` updates made by the TBB loop bodies.
static tbb::spin_mutex myMutex;

// Serializes the `pi += ...` update in threadFunction(); initialized in WinThread_Pi().
static CRITICAL_SECTION cs;

void Serial_Pi()

{

double x, sum = 0.0;

int i;

// ANNOTATE_SITE_BEGIN(101)

for (i=0; i< num_steps; i++){

// ANNOTATE_TASK_BEGIN(201)

x = (i+0.5)*step;

sum = sum + 4.0/(1.0 + x*x);

// ANNOTATE_TASK_END(201)

}

// ANNOTATE_SITE_END(101)

pi = step * sum;

}

/**
 * Thread entry point used by WinThread_Pi().
 *
 * Accumulates an interleaved slice of the midpoint-rule sum in a local
 * variable, then adds the scaled partial result to the global `pi` while
 * holding the critical section `cs`.
 *
 * @param pArg  Pointer to an int holding this thread's index; WinThread_Pi()
 *              passes values 0 .. num_threads-1.
 * @return Always 0.
 */
DWORD WINAPI threadFunction(LPVOID pArg)
{
    double partialSum = 0.0, x; // local to each thread
    const int myNum = *static_cast<int *>(pArg);

    // Thread k handles steps k, k + num_threads, k + 2*num_threads, ...
    // (loop header reconstructed: the condition and increment were lost in the
    // posted source).
    for (int i = myNum; i < num_steps; i += num_threads)
    {
        x = (i + 0.5)*step;
        partialSum += 4.0 / (1.0 + x*x); // compute partial sums at each thread
    }

    // Only the final accumulation touches shared state, so the lock is taken
    // once per thread rather than once per step.
    EnterCriticalSection(&cs);
    pi += partialSum * step; // add partial to global final answer
    LeaveCriticalSection(&cs);

    return 0;
}

void WinThread_Pi()

{

HANDLE threadHandles[num_threads];

int tNum[num_threads];

InitializeCriticalSection(&cs);

for ( int i=0; i{

tNum[i] = i;

threadHandles[i] = CreateThread( NULL, // Security attributes

0, // Stack size

threadFunction, // Thread function

(LPVOID)&tNum[i],// Data for thread func()

0, // Thread start mode

NULL); // Returned thread ID

}

WaitForMultipleObjects(num_threads, threadHandles, TRUE, INFINITE);

}

void OpenMP_Pi()

{

double x, sum=0.0;

int i;

omp_set_num_threads(4/*omp_get_thread_num()*/);

#pragma omp parallel for private (x) reduction(+:sum) //schedule(static,4)

for (i=0; i{

x = (i + 0.5)*step;

sum = sum + 4.0/(1. + x*x);

}

pi = sum*step;

}

class ParallelPi {

public:

void operator() (tbb::blocked_range& range) const {

double x, sum = 0.0;

for (int i = range.begin(); i < range.end(); ++i) {

x = (i+0.5)*step;

sum = sum + 4.0/(1.0 + x*x);

}

tbb::spin_mutex::scoped_lock lock(myMutex);

pi += step * sum;

}

};

void TBB_Pi ()

{

parallel_for (tbb::blocked_range (0, num_steps), ParallelPi(), tbb::auto_partitioner());

}

void TBB_Lambda_Pi()

{

parallel_for(blocked_range(0, num_steps, 5000 /*grain_size*/),

[](const blocked_range &r){

double x, sum = 0.0;

for (int i = r.begin(); i!=r.end(); ++i){

x = (i + 0.5)*step;

sum = sum + 4.0/(1. + x*x);

};

tbb::spin_mutex::scoped_lock lock(myMutex);

pi += step * sum;

});

}

/**
 * Sums 4/(1+x^2) at the midpoints of steps [begin, end).
 *
 * @param begin  First step index (inclusive).
 * @param end    Last step index (exclusive).
 * @return The unscaled partial sum; the caller multiplies by `step`.
 */
double compute_sub_pi(int begin, int end)
{
    double x, ret_sum=0.0;

    // Loop header reconstructed: the condition and increment were lost in the
    // posted source.
    for (int i=begin; i<end; i++)
    {
        x = (i+0.5)*step;
        ret_sum = ret_sum + 4.0/(1. + x*x);
    }

    return ret_sum;
}

void Cilk_For_Pi()

{

double sum=0.0, partial_sum[num_threads];

cilk_for (int i=0; i{

partial_sum[i] = compute_sub_pi(i*num_steps/num_threads, (i+1)*num_steps/num_threads);

}

for (int i=0; i

pi += step * sum;

}

void compute_spawn_pi(int begin, int end, int thread_id, double *partial_sum)

{

double x;

int begin_next, end_next, thread_id_next;

// prepare cilk_spawn for another thread

thread_id_next = thread_id+1;

begin_next = begin + num_steps/num_threads;

end_next = end + num_steps/num_threads;

if (thread_id_next cilk_spawn compute_spawn_pi(begin_next, end_next, thread_id_next, partial_sum);

}

partial_sum[thread_id] = 0.0;

for (int i=begin; i{

x = (i+0.5)*step;

partial_sum[thread_id] = partial_sum[thread_id] + 4.0/(1. + x*x);

}

}

void Cilk_Spawn_Pi()

{

double sum=0.0, partial_sum[num_threads];

compute_spawn_pi(0, num_steps/num_threads, 0, partial_sum);

cilk_sync;

for (int i=0; i

pi += step * sum;

}

int main()

{

clock_t start, stop;

typedef void (*VTFUNC)(void);

HMODULE hMod;

VTFUNC vtResume, vtPause;

hMod = LoadLibrary("VtuneApi.dll");

vtResume = (VTFUNC) GetProcAddress(hMod, "VTResume");

vtPause = (VTFUNC) GetProcAddress(hMod, "VTPause");

//printf ("Handle is %lx, vtResume is %lx, vtPause is %lx\\n", hMod, vtResume, vtPause);

// Coputing pi by using serial code

pi = 0.0;

step = 1.0/(double) num_steps;

start = clock();

Serial_Pi();

stop = clock();

printf ("Computed value of Pi by using serial code: %12.9f\\n", pi);

printf ("Elapsed time: %.2f seconds\\n", (double)(stop-start)/1000.0);

// Computing pi by using Windows Threads

pi = 0.0;

step = 1.0 / (double)num_steps;

start = clock();

WinThread_Pi();

stop = clock();

printf ("Computed value of Pi by using WinThreads: %12.9f\\n", pi);

printf ("Elapsed time: %.2f seconds\\n", (double)(stop-start)/1000.0);

//(vtResume());

// Computing pi by using OpenMP

pi = 0.0;

step = 1.0 / (double)num_steps;

start = clock();

OpenMP_Pi();

stop = clock();

printf ("Computed value of Pi by using OpenMP: %12.9f\\n", pi);

printf ("Elapsed time: %.2f seconds\\n", (double)(stop-start)/1000.0);

// Computing pi by using TBB

pi = 0.0;

step = 1.0 / (double)num_steps;

start = clock();

tbb::task_scheduler_init tbb_init;

TBB_Pi();

stop = clock();

printf ("Computed value of Pi by using TBB: %12.9f\\n", pi);

printf ("Elapsed time: %.2f seconds\\n", (double)(stop-start)/1000.0);

//(vtPause());

// Computing pi by using TBB's Lambda

pi = 0.0;

step = 1.0 / (double)num_steps;

start = clock();

//tbb::task_scheduler_init tbb_init;

TBB_Lambda_Pi();

stop = clock();

printf ("Computed value of Pi by using TBB's Lambda: %12.9f\\n", pi);

printf ("Elapsed time: %.2f seconds\\n", (double)(stop-start)/1000.0);

// Computing pi by using Cilk For

pi = 0.0;

step = 1.0 / (double)num_steps;

start = clock();

Cilk_For_Pi();

stop = clock();

printf ("Computed value of Pi by using cilk_for: %12.9f\\n", pi);

printf ("Elapsed time: %.2f seconds\\n", (double)(stop-start)/1000.0);

// Computing pi by using Cilk Spawn

pi = 0.0;

step = 1.0 / (double)num_steps;

start = clock();

Cilk_Spawn_Pi();

stop = clock();

printf ("Computed value of Pi by using cilk_spawn: %12.9f\\n", pi);

printf ("Elapsed time: %.2f seconds\\n", (double)(stop-start)/1000.0);

return 0;

}