On my computer, when num_steps = 100000000, the run takes 5.x seconds in most cases and 2.x seconds in a few cases, regardless of whether GrainSize = 50000000, GrainSize = 10000000, or the auto partitioner is used.

When num_steps = 1000000000, it takes 50 s in most cases and 20 s in a few cases.

Environment:

AMD5000

Windows7

VS2008

#include #include #include "tbb/parallel_reduce.h" #include "tbb/task_scheduler_init.h" #include "tbb/blocked_range.h" using namespace std; using namespace tbb; int Nthreads = 2; int GrainSize = 50000000; long long num_steps = 100000000; class CMyPi { double *const my_step; public: double sum; void operator()(const blocked_range& r); CMyPi(CMyPi& x, split); void join(const CMyPi& y); CMyPi(double *const step); }; CMyPi::CMyPi(double *const step):my_step(step) { sum = 0.0; } CMyPi::CMyPi(CMyPi &x, tbb::split):my_step(x.my_step) { sum = 0.0; } void CMyPi::join(const CMyPi &y) { sum += y.sum; } // step = 1.0/(double)num_steps; // for (i=0; i < num_steps; i++) // { // x = (i+0.5)*step; // sum = sum + 4.0/(1.0 + x*x); // } void CMyPi::operator ()(const blocked_range& r) { double x = 0.0; for(int i = r.begin();i!=r.end();++i) { x = (i+0.5)* (*my_step); sum+=4.0/(1.0+x*x); } } int main(int argc, char* argv[]) { clock_t start, stop; double pi; double width = 1./(double)num_steps; CMyPi step((double *const)&width); task_scheduler_init init(task_scheduler_init::deferred); start = clock(); init.initialize(Nthreads); //TBB parallel_reduce(blocked_range(0,num_steps,GrainSize), step); // parallel_reduce(blocked_range(0,num_steps), step, auto_partitioner()); pi = step.sum*width; stop = clock(); cout << "The value of PI is " << pi << endl; cout << "The time to calculate PI was " << (double)(stop-start)/CLOCKS_PER_SEC << " secondsn"; system("pause"); return 0; } //#include //static long num_steps=100000; //double step; //void main() //{ int i; // double x, pi, sum = 0.0; // step = 1.0/(double)num_steps; // for (i=0; i < num_steps; i++) // { // x = (i+0.5)*step; // sum = sum + 4.0/(1.0 + x*x); // } // pi = step * sum; // printf(Pi = %fn,pi); //}