Two issues in Cilk

Peter Wang (Intel):

1. I can't use cilk_spawn to call a function that has a return value, so I used cilk_for instead; see the example code below, function Cilk_For_Pi().

2. I tried cilk_spawn and returned the value through a function parameter instead; see the example code below, function Cilk_Spawn_Pi().

Both methods give the correct result, but the performance of case 2) is worse than the serial code. (I work on a T61, Windows XP, Composer 2011 beta build 0.16.)

Regards, Peter

Example code (you can remove the VTune and Advisor code if you can't build them):

#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <math.h>
#include <windows.h>
#include <omp.h>

#include "tbb/task_scheduler_init.h"
#include "tbb/blocked_range.h"
#include "tbb/parallel_for.h"
#include "tbb/spin_mutex.h"
#include "tbb/mutex.h"
#include "tbb/tick_count.h"

#include <cilk/cilk.h>

#include "Vtuneapi.h"

#include "annotate.h"

using namespace tbb;
using namespace std;

const int num_steps = 200000000;
const int num_threads = 8; // My laptop is T61, cores - 2
double step = 0.0, pi = 0.0;

static tbb::spin_mutex myMutex;
static CRITICAL_SECTION cs;

void Serial_Pi()
{
double x, sum = 0.0;
int i;

// ANNOTATE_SITE_BEGIN(101)
for (i=0; i< num_steps; i++){
// ANNOTATE_TASK_BEGIN(201)
x = (i+0.5)*step;
sum = sum + 4.0/(1.0 + x*x);
// ANNOTATE_TASK_END(201)
}
// ANNOTATE_SITE_END(101)
pi = step * sum;
}

DWORD WINAPI threadFunction(LPVOID pArg)
{

double partialSum = 0.0, x; // local to each thread
int myNum = *((int *)pArg);

for ( int i=myNum; i<num_steps; i+=num_threads){
x = (i + 0.5)*step;
partialSum += 4.0 / (1.0 + x*x); //compute partial sums at each thread
}

EnterCriticalSection(&cs);
pi += partialSum * step; // add partial to global final answer
LeaveCriticalSection(&cs);

return 0;
}

void WinThread_Pi()
{
HANDLE threadHandles[num_threads];
int tNum[num_threads];

InitializeCriticalSection(&cs);
for ( int i=0; i<num_threads; i++){
tNum[i] = i;
threadHandles[i] = CreateThread( NULL, // Security attributes
0, // Stack size
threadFunction, // Thread function
(LPVOID)&tNum[i],// Data for thread func()
0, // Thread start mode
NULL); // Returned thread ID
}
WaitForMultipleObjects(num_threads, threadHandles, TRUE, INFINITE);

}

void OpenMP_Pi()
{
double x, sum=0.0;
int i;

omp_set_num_threads(4/*omp_get_thread_num()*/);
#pragma omp parallel for private (x) reduction(+:sum) //schedule(static,4)

for (i=0; i<num_steps; i++){
x = (i + 0.5)*step;
sum = sum + 4.0/(1. + x*x);
}

pi = sum*step;
}

class ParallelPi {

public:

void operator() (tbb::blocked_range<int>& range) const {
double x, sum = 0.0;
for (int i = range.begin(); i < range.end(); ++i) {
x = (i+0.5)*step;
sum = sum + 4.0/(1.0 + x*x);
}
tbb::spin_mutex::scoped_lock lock(myMutex);
pi += step * sum;
}
};

void TBB_Pi ()
{
parallel_for (tbb::blocked_range<int>(0, num_steps), ParallelPi(), tbb::auto_partitioner());
}

void TBB_Lambda_Pi()
{
parallel_for(blocked_range<int>(0, num_steps, 5000 /*grain_size*/),
[](const blocked_range<int> &r){
double x, sum = 0.0;
for (int i = r.begin(); i!=r.end(); ++i){
x = (i + 0.5)*step;
sum = sum + 4.0/(1. + x*x);
};
tbb::spin_mutex::scoped_lock lock(myMutex);
pi += step * sum;
});

}

double compute_sub_pi(int begin, int end)
{
double x, ret_sum=0.0;

for (int i=begin; i<end; i++){
x = (i+0.5)*step;
ret_sum = ret_sum + 4.0/(1. + x*x);
}

return ret_sum;
}

void Cilk_For_Pi()
{
double sum=0.0, partial_sum[num_threads];

cilk_for (int i=0; i<num_threads; i++){
partial_sum[i] = compute_sub_pi(i*num_steps/num_threads, (i+1)*num_steps/num_threads);
}

for (int i=0; i<num_threads; i++) sum += partial_sum[i];

pi += step * sum;
}

void compute_spawn_pi(int begin, int end, int thread_id, double *partial_sum)
{
double x;
int begin_next, end_next, thread_id_next;

// prepare cilk_spawn for another thread
thread_id_next = thread_id+1;
begin_next = begin + num_steps/num_threads;
end_next = end + num_steps/num_threads;

if (thread_id_next < num_threads){
cilk_spawn compute_spawn_pi(begin_next, end_next, thread_id_next, partial_sum);
}

partial_sum[thread_id] = 0.0;
for (int i=begin; i<end; i++){
x = (i+0.5)*step;
partial_sum[thread_id] = partial_sum[thread_id] + 4.0/(1. + x*x);
}

}

void Cilk_Spawn_Pi()
{
double sum=0.0, partial_sum[num_threads];

compute_spawn_pi(0, num_steps/num_threads, 0, partial_sum);
cilk_sync;

for (int i=0; i<num_threads; i++) sum += partial_sum[i];

pi += step * sum;
}

int main()
{

clock_t start, stop;

typedef void (*VTFUNC)(void);
HMODULE hMod;
VTFUNC vtResume, vtPause;

hMod = LoadLibrary("VtuneApi.dll");

vtResume = (VTFUNC) GetProcAddress(hMod, "VTResume");
vtPause = (VTFUNC) GetProcAddress(hMod, "VTPause");
//printf ("Handle is %lx, vtResume is %lx, vtPause is %lx\\n", hMod, vtResume, vtPause);

// Computing pi by using serial code
pi = 0.0;
step = 1.0/(double) num_steps;
start = clock();
Serial_Pi();
stop = clock();
printf ("Computed value of Pi by using serial code: %12.9f\\n", pi);
printf ("Elapsed time: %.2f seconds\\n", (double)(stop-start)/1000.0);

// Computing pi by using Windows Threads
pi = 0.0;
step = 1.0 / (double)num_steps;
start = clock();
WinThread_Pi();
stop = clock();
printf ("Computed value of Pi by using WinThreads: %12.9f\\n", pi);
printf ("Elapsed time: %.2f seconds\\n", (double)(stop-start)/1000.0);

//(vtResume());

// Computing pi by using OpenMP
pi = 0.0;
step = 1.0 / (double)num_steps;
start = clock();
OpenMP_Pi();
stop = clock();
printf ("Computed value of Pi by using OpenMP: %12.9f\\n", pi);
printf ("Elapsed time: %.2f seconds\\n", (double)(stop-start)/1000.0);

// Computing pi by using TBB
pi = 0.0;
step = 1.0 / (double)num_steps;
start = clock();
tbb::task_scheduler_init tbb_init;
TBB_Pi();
stop = clock();
printf ("Computed value of Pi by using TBB: %12.9f\\n", pi);
printf ("Elapsed time: %.2f seconds\\n", (double)(stop-start)/1000.0);

//(vtPause());

// Computing pi by using TBB's Lambda
pi = 0.0;
step = 1.0 / (double)num_steps;
start = clock();
//tbb::task_scheduler_init tbb_init;
TBB_Lambda_Pi();
stop = clock();
printf ("Computed value of Pi by using TBB's Lambda: %12.9f\\n", pi);
printf ("Elapsed time: %.2f seconds\\n", (double)(stop-start)/1000.0);

// Computing pi by using Cilk For
pi = 0.0;
step = 1.0 / (double)num_steps;
start = clock();
Cilk_For_Pi();
stop = clock();
printf ("Computed value of Pi by using cilk_for: %12.9f\\n", pi);
printf ("Elapsed time: %.2f seconds\\n", (double)(stop-start)/1000.0);

// Computing pi by using Cilk Spawn
pi = 0.0;
step = 1.0 / (double)num_steps;
start = clock();
Cilk_Spawn_Pi();
stop = clock();
printf ("Computed value of Pi by using cilk_spawn: %12.9f\\n", pi);
printf ("Elapsed time: %.2f seconds\\n", (double)(stop-start)/1000.0);

return 0;
}

Peter Wang (Intel):

Here are the test results:

C:\Documents and Settings\zwang14\Desktop\zwang\OEMs_ISVs\demo\Pi\Release>pi
Computed value of Pi by using serial code: 3.141592654
Elapsed time: 4.17 seconds
Computed value of Pi by using WinThreads: 3.141592654
Elapsed time: 2.41 seconds
Computed value of Pi by using OpenMP: 3.141592654
Elapsed time: 2.30 seconds
Computed value of Pi by using TBB: 3.141592654
Elapsed time: 2.45 seconds
Computed value of Pi by using TBB's Lambda: 3.141592654
Elapsed time: 2.58 seconds
Computed value of Pi by using cilk_for: 3.141592654
Elapsed time: 2.78 seconds
Computed value of Pi by using cilk_spawn: 3.141592654
Elapsed time: 6.17 seconds

Hi,

Your post made me feel curious.

I've tried a simplified version of your code (no TBB) in XP with 2 cores and got the following:

Computed value of Pi by using serial code: 3.141592654
Elapsed time: 3.00 seconds
Computed value of Pi by using WinThreads: 3.141592654
Elapsed time: 1.83 seconds
Computed value of Pi by using cilk_for: 3.141592654
Elapsed time: 1.49 seconds
Computed value of Pi by using cilk_spawn: 3.141592654
Elapsed time: 1.48 seconds

Maybe it's something to do with mixing them all?
I'm thinking of using it to try a reducer, but that will have to wait.

Best regards

Manuel

Jennifer J. (Intel):

Yes, very interesting test indeed.

Manuel,
could you attach your code so I could verify why your cilk_spawn runs faster?

Peter,
In your original code, the function "compute_spawn_pi()" contains "cilk_spawn compute_spawn_pi()". Note that there is an implicit sync at the end of the function that contains the "cilk_spawn". So this makes "compute_spawn_pi()" serial.
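
[Editor's note: one possible restructuring, added here as a sketch only (the function name Cilk_Spawn_Pi_Flat is made up), is to spawn every chunk from a single parent function and sync once, instead of recursing and hitting each level's implicit sync. It reuses compute_sub_pi() from the posted code, which returns the partial sum for a chunk:]

```cpp
// Sketch: spawn all chunks from one parent, then a single cilk_sync.
// compute_sub_pi() is the helper already defined in the posted code.
void Cilk_Spawn_Pi_Flat()
{
    double sum = 0.0, partial_sum[num_threads];

    for (int i = 0; i < num_threads; i++)
        partial_sum[i] = cilk_spawn compute_sub_pi(i * num_steps / num_threads,
                                                   (i + 1) * num_steps / num_threads);
    cilk_sync; // one wait point for all spawned chunks

    for (int i = 0; i < num_threads; i++)
        sum += partial_sum[i];
    pi += step * sum;
}
```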

do you agree?

Jennifer

Jennifer J. (Intel):

Hi Peter,
About the #1 question: cilk_spawn can take functions with a return value. The format is like:

int ret = cilk_spawn my_func(a);
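
[Editor's note: applied to the Pi code in this thread, a minimal sketch of that form might look like the following. The function name Cilk_Spawn_Return_Pi is made up; compute_sub_pi() is the helper from the posted code.]

```cpp
// Sketch: spawn a function that returns a value, and sync before reading it.
void Cilk_Spawn_Return_Pi()
{
    double first_half = cilk_spawn compute_sub_pi(0, num_steps / 2);
    double second_half = compute_sub_pi(num_steps / 2, num_steps); // runs in the parent meanwhile
    cilk_sync; // first_half must not be read before this point

    pi = step * (first_half + second_half);
}
```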

Does it not work for you?

Jennifer

Peter Wang (Intel):

Quoting Jennifer Jiang (Intel): Hi Peter,
About the #1 question: cilk_spawn can take functions with a return value. The format is like:

int ret = cilk_spawn my_func(a);

Does it not work for you?

Jennifer

Jennifer:
Yes. This works for me. Thank you.
-Peter

Peter Wang (Intel):

Quoting Jennifer Jiang (Intel): Yes, very interesting test indeed.

Manuel,
could you attach your code so I could verify why your cilk_spawn runs faster?

Peter,
In your original code, the function "compute_spawn_pi()" contains "cilk_spawn compute_spawn_pi()". And note that there is an implicit sync at the end of the function that contains "cilk_spawn". So this makes the "compute_spawn_pi()" serial.

do you agree?

Jennifer

Thank you all.

However, I got the same result when I tested my code on another machine. Please attach your test code - maybe you added some advanced code that I haven't realized.

Jennifer, I saw "There is an implicit cilk_sync at the end of every function and every try block that contains a cilk_spawn." I agree that much time is spent on "wait", but I disagree that "compute_spawn_pi()" is serial. At the least, the performance should be better than the serial code. Unfortunately it was not.

I used Intel(R) Thread Profiler to show you info about using "cilk_spawn" and "cilk_for" respectively. There are definitely problems in cilk_spawn's implementation - are we using different Composer versions?

Regards, Peter

TP's result for using cilk_spawn (it costs 7.5s):

TP's result for using "cilk_for" (it only costs 2.5s):

Jennifer J. (Intel):

Hi Peter,
Yes, I agree that there is a performance issue.

Even with the suggested cilk_for version below, it runs much slower than the serial code. The reason is related to vectorization: when cilk_for is used, vectorization isn't done. I'll file a bug report with the vectorization team about it.

void Jenn_Cilk_For_Pi()
{
    cilk::reducer_opadd<double> sum(0.0);
    double x;

    cilk_for (int i=0; i<num_steps; i++){
        x = (i+0.5)*step;
        sum += 4.0/(1. + x*x);
    }
    pi = sum.get_value() * step;
}
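
[Editor's note: one way to keep the hot loop vectorizable is to combine the reducer with the chunking that Peter's Cilk_For_Pi already uses, so the inner loop stays inside the serial compute_sub_pi(). A sketch only; the function name is made up, and it assumes <cilk/reducer_opadd.h> is included.]

```cpp
// Sketch: cilk_for over a handful of chunks; the inner loop lives in
// compute_sub_pi() and remains a plain serial loop the compiler can vectorize.
void Chunked_Reducer_Pi()
{
    cilk::reducer_opadd<double> sum(0.0);

    cilk_for (int c = 0; c < num_threads; c++)
        sum += compute_sub_pi(c * num_steps / num_threads,
                              (c + 1) * num_steps / num_threads);

    pi = sum.get_value() * step;
}
```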

Also attached is the changed code for your testing.

Thanks for your test case.
Jennifer

Attachment: c.cpp (5.87 KB)
Peter Wang (Intel):

Jennifer,

Thanks for your modified code, I got trained on using a cilk_for reducer :-)

Note that the result of computing pi is 3.141680438 for "Jennifer cilk_for" - the right value should be 3.141592654. Please check. Was that caused by using sum.get_value()?

Regards, Peter

Using a reducer is the cleanest approach. In the current beta release, this approach has performance problems - there are some optimizations that are not yet implemented in that version. With those optimizations, the reducer approach will not incur any performance penalty.

In the code you show, there is a race on the variable "x". Better to declare it inside the for loop:

double x = (i+0.5)*step;

steve
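
[Editor's note: a corrected sketch of the reducer loop along those lines, added for illustration (the function name is made up; it assumes <cilk/reducer_opadd.h> is included):]

```cpp
// Sketch: the reducer loop with the race removed - x is local to each iteration.
void Reducer_Pi_Fixed()
{
    cilk::reducer_opadd<double> sum(0.0);

    cilk_for (int i = 0; i < num_steps; i++) {
        double x = (i + 0.5) * step;  // per-iteration local, no shared variable
        sum += 4.0 / (1.0 + x * x);
    }

    pi = sum.get_value() * step;
}
```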

Hi Manuel,

Thanks so much for using this private beta forum as I had requested in the private email I sent out - much appreciated. As you can see, posts in our forum get responses from all of our support folks, including some from cross-product division support teams too (FYI).

-regards,
Kittur

Hi, again

Sorry if I couldn't follow the posts. Anyway, it seems that things have moved on since my last visit.

The code I used was pretty much the same as Peter used. I'll paste it in case it is of any help.

#include "stdafx.h"

#include

#include

#include

#include

#include

#include

#include

using namespace std;

const int num_steps = 200000000;

const int num_threads = 8; // My laptop is T61, cores - 2

double step = 0.0, pi = 0.0;

//static tbb::spin_mutex myMutex;

static CRITICAL_SECTION cs;

void Serial_Pi()
{
double x, sum = 0.0;
int i;

// ANNOTATE_SITE_BEGIN(101)
for (i=0; i< num_steps; i++){
// ANNOTATE_TASK_BEGIN(201)
x = (i+0.5)*step;
sum = sum + 4.0/(1.0 + x*x);
// ANNOTATE_TASK_END(201)
}
// ANNOTATE_SITE_END(101)
pi = step * sum;
}

DWORD WINAPI threadFunction(LPVOID pArg)
{
double partialSum = 0.0, x; // local to each thread
int myNum = *((int *)pArg);

for ( int i=myNum; i<num_steps; i+=num_threads)
{
x = (i + 0.5)*step;
partialSum += 4.0 / (1.0 + x*x); //compute partial sums at each thread
}

EnterCriticalSection(&cs);
pi += partialSum * step; // add partial to global final answer
LeaveCriticalSection(&cs);

return 0;
}

void WinThread_Pi()
{
HANDLE threadHandles[num_threads];
int tNum[num_threads];

InitializeCriticalSection(&cs);
for ( int i=0; i<num_threads; i++)
{
tNum[i] = i;
threadHandles[i] = CreateThread( NULL, // Security attributes
0, // Stack size
threadFunction, // Thread function
(LPVOID)&tNum[i],// Data for thread func()
0, // Thread start mode
NULL); // Returned thread ID
}
WaitForMultipleObjects(num_threads, threadHandles, TRUE, INFINITE);
}

double compute_sub_pi(int begin, int end)
{
double x, ret_sum=0.0;

for (int i=begin; i<end; i++)
{
x = (i+0.5)*step;
ret_sum = ret_sum + 4.0/(1. + x*x);
}

return ret_sum;
}

void Cilk_For_Pi()
{
double sum=0.0, partial_sum[num_threads];

cilk_for (int i=0; i<num_threads; i++)
{
partial_sum[i] = compute_sub_pi(i*num_steps/num_threads, (i+1)*num_steps/num_threads);
}

for (int i=0; i<num_threads; i++) sum += partial_sum[i];

pi += step * sum;
}

void compute_spawn_pi(int begin, int end, int thread_id, double *partial_sum)
{
double x;
int begin_next, end_next, thread_id_next;

// prepare cilk_spawn for another thread
thread_id_next = thread_id+1;
begin_next = begin + num_steps/num_threads;
end_next = end + num_steps/num_threads;

if (thread_id_next < num_threads)
{
cilk_spawn compute_spawn_pi(begin_next, end_next, thread_id_next, partial_sum);
}

partial_sum[thread_id] = 0.0;
for (int i=begin; i<end; i++)
{
x = (i+0.5)*step;
partial_sum[thread_id] = partial_sum[thread_id] + 4.0/(1. + x*x);
}

}

void Cilk_Spawn_Pi()
{
double sum=0.0, partial_sum[num_threads];

compute_spawn_pi(0, num_steps/num_threads, 0, partial_sum);
cilk_sync;

for (int i=0; i<num_threads; i++) sum += partial_sum[i];

pi += step * sum;
}

int cilk_main(int argc, char* argv[])
{
clock_t start, stop;

typedef void (*VTFUNC)(void);
HMODULE hMod;

// Computing pi by using serial code
pi = 0.0;
step = 1.0/(double) num_steps;
start = clock();
Serial_Pi();
stop = clock();
printf ("Computed value of Pi by using serial code: %12.9f\n", pi);
printf ("Elapsed time: %.2f seconds\n", (double)(stop-start)/1000.0);

// Computing pi by using Windows Threads
pi = 0.0;
step = 1.0 / (double)num_steps;
start = clock();
WinThread_Pi();
stop = clock();
printf ("Computed value of Pi by using WinThreads: %12.9f\n", pi);
printf ("Elapsed time: %.2f seconds\n", (double)(stop-start)/1000.0);

// Computing pi by using Cilk For
pi = 0.0;
step = 1.0 / (double)num_steps;
start = clock();
Cilk_For_Pi();
stop = clock();
printf ("Computed value of Pi by using cilk_for: %12.9f\n", pi);
printf ("Elapsed time: %.2f seconds\n", (double)(stop-start)/1000.0);

// Computing pi by using Cilk Spawn
pi = 0.0;
step = 1.0 / (double)num_steps;
start = clock();
Cilk_Spawn_Pi();
stop = clock();
printf ("Computed value of Pi by using cilk_spawn: %12.9f\n", pi);
printf ("Elapsed time: %.2f seconds\n", (double)(stop-start)/1000.0);

return 0;
}

Best regards, Manuel
