Hello All,
I am facing an issue with parallel_for. I am using parallel_for for one particular operation, and I am not able to get any speedup from the TBB code.
[bash]
class Distance_Global {
public:
    float* diff_array;
    float* temp1;
    float* query1;

    void operator()(const blocked_range<int>& r) const {
        float *temp_1 = temp1;
        float *query_1 = query1;
        int end=r.end();
        for (int i=r.begin();i!=end;++i){
            diff_array[i]=(temp_1[i] - query_1[i])*(temp_1[i] - query_1[i]);
        }
    }
};

int main (int argc, char ** argv) {
    int numElements = 19800;
    int GRAIN = 1000;
    if (argc == 3) {
        numElements = atoi(argv[1]);
        GRAIN = atoi(argv[2]);
    }
    cout << "Running with #Elements : " << numElements << " And GRAIN : " << GRAIN << endl;

    float out1[numElements], out2[numElements],diff_array[numElements];
    for (int i=0; i < numElements; ++i) {
        out1[i] = 1.5345;
        out2[i] = 0.8976;
    }

    tick_count t0 = tick_count::now( );
    Distance_Global dg;
    dg.diff_array=diff_array;
    dg.temp1=out1;
    dg.query1=out2;
    parallel_for(blocked_range<int>(0,numElements,GRAIN),dg);
    tick_count t1 = tick_count::now( );
    cout<<"Parall: "<<(t1-t0).seconds()<<endl;
    // [the remainder of the listing was lost when the post was extracted]
}
[/bash]

The number of elements, numElements, is fixed at 19800.
I experimented with the GRAIN size, but it did not help.
The TBB version of my code takes almost 10 times as long as the serial code. This is quite shocking.
Am I making a mistake in the code, or what should I do to speed things up?
Thanks


