# Why does performance decrease when using TBB?

## Why does performance decrease when using TBB?

To try multicore programming with TBB, I tested the following code in this environment (TBB 2.1, Intel C++ Compiler 11.x, Visual Studio 2008, Intel Core 2 Quad 8200, Windows XP):

[CODE C]
#include <cstddef>
#include <cstdlib>
#include <ctime>
#include <iostream>

#include "tbb/parallel_for.h"
#include "tbb/blocked_range2d.h"
#include "tbb/tick_count.h"
#include "tbb/partitioner.h"

using namespace tbb;

// Matrix dimensions used throughout: a is M x L, b is L x N, and c is M x N.
const size_t L = 200;  // inner dimension: columns of a, rows of b
const size_t M = 200;  // rows of a and of c
const size_t N = 200;  // columns of b and of c

// Single-threaded reference product: writes c = a * b, where a is M x L,
// b is L x N and c is M x N. Every element of c is overwritten.
void SerialMatrtixMultiply(float c[][N], float a[][L], float b[][N]){
    for(size_t row = 0; row != M; ++row){
        const float* lhs = a[row];  // current row of a
        float* out = c[row];        // matching row of c
        for(size_t col = 0; col != N; ++col){
            // Dot product of row `row` of a with column `col` of b.
            float acc = 0;
            for(size_t inner = 0; inner != L; ++inner){
                acc += lhs[inner] * b[inner][col];
            }
            out[col] = acc;
        }
    }
}

class MatrixMultiply2D{
float (*my_a)[L];
float (*my_b)[N];
float (*my_c)[N];

public:
void operator()(const blocked_range2d& r) const {
float (*a)[L] = my_a;
float (*b)[N] = my_b;
float (*c)[N] = my_c;

for(size_t i = r.rows().begin(); i != r.rows().end(); ++i){
for(size_t j = r.cols().begin(); j != r.cols().end(); ++j){
float sum = 0;
for(size_t k = 0; k < L; ++k)
sum += a[i][k] * b[k][j];
c[i][j] = sum;
}
}
}
MatrixMultiply2D(float c[][N], float a[][L], float b[][N]):my_a(a), my_b(b), my_c(c)
{}
};

// Parallel product: writes c = a * b by running MatrixMultiply2D over the
// M x N index space of c with parallel_for. The range is given as
// blocked_range2d<size_t>(row_begin, row_end, col_begin, col_end) and the
// chunking is left to auto_partitioner.
void ParallelMatrixMultiply(float c[][N], float a[][L], float b[][N]){
    parallel_for(blocked_range2d<size_t>(0, M, 0, N), MatrixMultiply2D(c,a,b), auto_partitioner());
}

// Reads every element of c and stores the total in a volatile object, so the
// multiplication that produced c has an observable use and cannot be removed
// by the optimizer as dead code.
static void ConsumeResult(float c[][N]){
    float total = 0;
    for(size_t i = 0; i < M; ++i){
        for(size_t j = 0; j < N; ++j)
            total += c[i][j];
    }
    volatile float sink = total;
    (void)sink;
}

// Benchmark driver: fills a and b with pseudo-random values in [0, 29],
// then times the serial and the parallel multiplication and prints both
// elapsed times in seconds.
int main(void){

    // Static storage instead of automatic: the three matrices take about
    // 480 KB together, which is a large share of a typical default stack.
    static float a[M][L];
    static float b[L][N];
    static float c[M][N];

    srand(static_cast<unsigned int>(time(NULL)));
    for(size_t i = 0; i < M; ++i){
        for(size_t j = 0; j < L; ++j)
            a[i][j] = static_cast<float>(rand() % 30);
    }

    for(size_t i = 0; i < L; ++i){
        for(size_t j = 0; j < N; ++j)
            b[i][j] = static_cast<float>(rand() % 30);
    }

    tick_count t0 = tick_count::now();
    SerialMatrtixMultiply(c,a,b);
    tick_count t1 = tick_count::now();
    std::cout << "seq eslaped time : " << (t1 - t0).seconds() << std::endl;
    // Use the result outside the timed region. Without any read of c an
    // optimizing compiler may drop the whole multiplication, which makes the
    // measured time meaninglessly small.
    ConsumeResult(c);

    t0 = tick_count::now();
    ParallelMatrixMultiply(c,a,b);
    t1 = tick_count::now();
    std::cout << "parallel eslaped time : " << (t1 - t0).seconds() << std::endl;
    ConsumeResult(c);

    return 0;
}
[/CODE]

The elapsed time :

seq elapsed time : 0.04437542 (s)
parallel elapsed time : 0.0111000 (s)

The parallel case is faster than the sequential case.

But when I removed "task_scheduler_init init;" and "ParallelMatrixMultiply(c,a,b);", i.e. ran only SerialMatrtixMultiply(c,a,b), the elapsed time was 0.0000636 (s).

This is very much faster than the TBB case.

Why does this strange result appear?