Hi,
I am new to TBB. I developed a simple program to assess the
performance of parallel_reduce and parallel_for against serial execution. The
program simply sums up the contents of a large array (on the order of 10M elements) and displays the sequence in which objects are copied and joined. To
my astonishment, the parallelized program takes a lot more time
than the serial program to sum up the contents. Please give me
an explanation and a solution. I am copying the program below.
I am using :
Processor : Core 2 Duo
OS : RHEL 4.0
PS: I also timed the program using the `time` command.
Program :
//-------------------------------------------------starts here-----------------------------------------
// To disable printing of values, #undef PRINT.
// NOTE: the header names on these #include lines were lost when the listing
// was extracted; they are reconstructed from the names the program uses.
#include <cstdlib>
#include <ctime>
#include <iostream>
#include <new>
#include <vector>
#include "tbb/blocked_range.h"
#include "tbb/parallel_for.h"
#include "tbb/parallel_reduce.h"
#include "tbb/task_scheduler_init.h"
using namespace std;
using namespace tbb;
// Build-time configuration for the benchmark.
// REAL      : element type of the array being filled and summed.
// VALUE     : constant stored into every array element. It must not carry a
//             trailing semicolon, otherwise it cannot be used inside an
//             expression such as `x == VALUE` or `f(VALUE)`.
// DEBUG     : enables the object-counting / timing instrumentation.
// MAX       : number of array elements.
// GRAINSIZE : grain size passed to blocked_range.
// PRINT     : enables per-object log output (only effective with DEBUG).
#define REAL double
#define VALUE 0.025
#define DEBUG
#define MAX 10000000
#define GRAINSIZE 500000
#define PRINT
class ApplyFoo {
REAL *a;
size_t len;
#ifdef DEBUG
static unsigned int noo;
static unsigned int nob;
int objNo;
clock_t obtime;
static clock_t max_obtime;
static clock_t main_obtime;
#endif
public :
ApplyFoo(REAL array[], size_t length) {
a=array;
len=length;
#ifdef DEBUG
#ifdef PRINT
cout<<"
--------------------------ApplyFoo --------------------------------
";
cout<<"inside ApplyFoo(REAL array[], size_t length)
";
cout<<"Number of object new objt destroying objt"< #endif
objNo = nob;
main_obtime = -1;
noo++;
nob++;
while(main_obtime == -1) main_obtime = clock();
#ifdef PRINT
cout< #endif
#endif
}
ApplyFoo(const ApplyFoo& f) {
a = f.a;
len = f.len;
#ifdef DEBUG
obtime = -1;
noo++;
nob++;
objNo = nob;
while(obtime==-1) obtime = clock();
#ifdef PRINT
cout< #endif
#endif
}
void operator()(const blocked_range& r) const {
REAL *arr = a;
for(size_t i=r.begin(); i!=r.end(); i++) arr[i] = VALUE;
}
~ApplyFoo() {
#ifdef DEBUG
noo--;
obtime = clock() - obtime;
REAL fobtime = (double)obtime/CLOCKS_PER_SEC;
#ifdef PRINT
cout<<" "< #endif
if(max_obtime max_obtime = obtime;
}else if(objNo == 1) {
main_obtime = clock() - main_obtime;
#ifdef PRINT
cout<<"max_obtime : "<"<<(double)max_obtime/CLOCKS_PER_SEC< cout<<"main_obtime : "<"<<(double)main_obtime/CLOCKS_PER_SEC< #endif
}
#endif
}
#ifdef DEBUG
void display_time() {
cout<<"max_obtime : "<
cout<<"main_obtime : "< }
#endif
};
#ifdef DEBUG
unsigned int ApplyFoo::noo = 0;
unsigned int ApplyFoo::nob = 0;
clock_t ApplyFoo::max_obtime;
clock_t ApplyFoo::main_obtime;
#endif
#undef DEBUG
// Fills a[0 .. n) with the ApplyFoo body using parallel_for.
//   a  : array to fill (must hold at least n elements)
//   n  : number of elements
//   gs : grain size for blocked_range; 0 is treated as 1 because a
//        blocked_range needs a positive grain size
// The blocked_range template argument was missing from the recovered listing
// and is restored here.
void ParallelApplyFoo(REAL a[], size_t n, size_t gs) {
    if(gs == 0) gs = 1;
    ApplyFoo af(a,n);
    parallel_for(blocked_range<size_t>(0,n,gs), af);
#ifdef DEBUG
    af.display_time();
#endif
}
#define DEBUG
class SumFoo {
REAL *a;
size_t len;
REAL sum;
#ifdef DEBUG
static unsigned int noo;
static unsigned int nob;
int objNo;
clock_t obtime;
static clock_t max_obtime;
static clock_t main_obtime;
#endif
public :
SumFoo(REAL array[], size_t length) {
a=array;
len=length;
sum = 0.0;
#ifdef PRINT
cout<<"
----------------------------- Summing using SumFoo -----------------------
";
#endif
#ifdef DEBUG
main_obtime = -1;
#ifdef PRINT
cout<<"inside SumFoo(REAL array[], size_t length)
";
cout<<"Number of object new objt destroying objt"< #endif
objNo = nob;
noo++;
nob++;
while(main_obtime == -1) main_obtime = clock();
#ifdef PRINT
cout< #endif
#endif
}
SumFoo(const SumFoo& f, split) {
a = f.a;
len = f.len;
 
; sum = 0.0;
#ifdef DEBUG
obtime = -1;
noo++;
nob++;
objNo = nob;
while(obtime==-1) obtime = clock();
#ifdef PRINT
cout< #endif
#endif
}
void operator()(const blocked_range& r) {
REAL *arr = a;
for(size_t i=r.begin(); i!=r.end(); i++) sum+=arr[i];
}
void displaysum() {
cout< }
void join(SumFoo &f) {
sum += f.sum;
#ifdef DEBUG
noo--;
f.obtime = clock() - f.obtime;
REAL fobtime = (double)f.obtime/CLOCKS_PER_SEC;
#ifdef PRINT
cout<<" "< #endif
if(max_obtime max_obtime = f.obtime;
}else if(f.objNo == 1) {
main_obtime = clock() - main_obtime;
#ifdef PRINT
cout<<"max_obtime : "<"<<(double)max_obtime/CLOCKS_PER_SEC< cout<<"main_obtime : "<"<<(double)main_obtime/CLOCKS_PER_SEC< #endif
}
#endif
}
#i
fdef DEBUG
void display_time() {
cout<<"max_obtime : "< cout<<"main_obtime : "< }
#endif
};
#ifdef DEBUG
unsigned int SumFoo::noo = 0;
unsigned int SumFoo::nob = 0;
clock_t SumFoo::max_obtime;
clock_t SumFoo::main_obtime;
#endif
//#undef DEBUG
// Sums a[0 .. n) with the SumFoo body using parallel_reduce and prints the
// result to cout.
//   a  : array to sum (must hold at least n elements)
//   n  : number of elements
//   gs : grain size used when neither AUTO_PART nor SIMPLE_PART is defined;
//        0 is treated as 1 because a blocked_range needs a positive grain size
// Define AUTO_PART or SIMPLE_PART to select the matching partitioner instead
// of the explicit grain size.
// The blocked_range template arguments and the tail of the GRAINSIZE print
// were missing from the recovered listing and are restored here.
void ParallelSumFoo(REAL a[], size_t n, size_t gs) {
    if(gs == 0) gs = 1;
/*#ifdef COMPARE_ALL
#ifdef AUTO_PART
#undef AUTO_PART
#endif
#ifdef SIMPLE_PART
#undef SIMPLE_PART
#endif
SumFoo sf1(a,n), sf2(a,n);
parallel_reduce(blocked_range<size_t>(0,n), sf1, simple_partitioner());
cout<<"using SIMPLE_PART sum : "; sf1.displaysum();
parallel_reduce(blocked_range<size_t>(0,n), sf2, auto_partitioner());
cout<<"using AUTO_PART sum : "; sf2.displaysum();
#endif
*/
    SumFoo sf(a,n);
#ifdef AUTO_PART
    parallel_reduce(blocked_range<size_t>(0,n), sf, auto_partitioner());
    cout<<"using AUTO_PART ";
#else
#ifdef SIMPLE_PART
    parallel_reduce(blocked_range<size_t>(0,n), sf, simple_partitioner());
    cout<<"using SIMPLE_PART ";
#else
    parallel_reduce(blocked_range<size_t>(0,n,gs), sf);
    cout<<"using GRAINSIZE = "<<gs<<" ";
#endif
#endif
    cout<<"sum : ";
    sf.displaysum();
#ifdef DEBUG
    sf.display_time();
#endif
}
#define SERIAL
#ifdef SERIAL
class SerialApplyFoo {
REAL *a;
size_t len;
double sum;
clock_t tserial;
public :
SerialApplyFoo(REAL *array, size_t length) : a(array), len(length){
cout<<"
------------------------ Summing using SerialApplyFoo ---------------------
";
&nb
sp; sum = 0.0;
tserial = -1;
while(tserial == -1) tserial = clock();
for(size_t i=0; i sum += a[i];
}
tserial = clock() - tserial;
}
void displaysum() {
cout<<"SerialApplyFoo sum : "< }
~SerialApplyFoo() {
displaysum();
cout<<"Approx time taken by SerialApplyFoo = "<"<<(double)tserial/CLOCKS_PER_SEC< }
};
#endif
int main() {
task_scheduler_init init;
REAL *array = new REAL[MAX];
if(array == NULL) {
cout<<"array == NULL
";
exit(0);
}
ParallelApplyFoo(array, MAX, GRAINSIZE);
#ifdef SERIAL
SerialApplyFoo sf(array, MAX);
#endif
ParallelSumFoo(array, MAX, GRAINSIZE);
return 0;
}



