Hi, everyone.

I want to calculate PI using multi-core parallel algorithms. The following is my code. The first part is written with TBB's parallel_reduce, and the second part with OpenMP's reduction. Although both give the correct answer, 3.14159, **I found that the OpenMP reduction is much faster than TBB's parallel_reduce.** For example, on my Intel i930 PC (with 4 cores), TBB's parallel_reduce needs 1.9 seconds, while OpenMP's reduction requires 0.98 seconds. I cannot understand this difference. Would anyone like to give some advice?

#include <ctime>
#include <iostream>

#include "tbb/tbb.h"

using namespace std;

using namespace tbb;

// Number of rectangles used by the midpoint-rule integration of 4/(1+x^2) on [0,1].
const int num_steps = 1000000000;

// Width of one rectangle.
const double step = 1.0 / static_cast<double>(num_steps);

// Receives the computed approximation; written by main().
double pi = 0.0;

class CMyPi

{

public:

double sum;

CMyPi() : sum(0.0) {}

void operator() (const blocked_range& r)

{

for(int i = r.begin();i!=r.end();++i)

{

double x = (i+0.5)*step;

sum += 4.0/(1.0 + x*x);

}

}

CMyPi(CMyPi& x, split) : sum(0.0) {}

void join(const CMyPi& y) { sum += y.sum; }

};

int main()

{

clock_t start, stop;

CMyPi myPi;

start = clock();

parallel_reduce(blocked_range(0, num_steps), myPi);

pi = step * myPi.sum;

stop = clock();

//cout << "The value of PI is " << pi << endl;

cout << "The time to calculate PI was " << (double)(stop-start)/CLOCKS_PER_SEC << " seconds\\n\\n";

start = clock();

double sum = 0.0;

#pragma omp parallel for reduction(+:sum)

for (int i=0; i {

double x = (i+0.5)*step;

sum += 4.0/(1.0 + x*x);

}

pi = step*sum;

stop = clock();

//cout << "The value of PI is " << pi << endl;

cout << "The time to calculate PI was " << (double)(stop-start)/CLOCKS_PER_SEC << " seconds\\n";

return 0;

}