Hello,

Recently, I started learning TBB. I tried to implement an RBF calculation using the following program. Unfortunately, when running on 4 cores, its performance is similar to or worse than that of the serial program. I would be grateful for a hint on how to improve the program.

Thanks,

Stan

#include <iostream>

#include "parallel_1.h"

#include <tbb\tick_count.h>

#include <tbb\parallel_reduce.h>

#include <tbb\task_group.h>

#include <mkl_vml.h>

#include <mkl_blas.h>

#define GRAIN_SIZE 1000

#define N 100

#define M 200

const int DIM = 200*100;

// Reduction body used with parallel_reduce: accumulates the sum of squared
// element-wise differences between two float arrays and exposes the square
// root of that sum (the Euclidean distance) through resut().
class Norm2 {

    float *_a, *_b;   // the two arrays being compared (not owned)

    double _ss;       // running sum of squared differences

public:

    // Starts a fresh accumulation over arrays a and b.
    Norm2(float *a, float *b) : _a(a), _b(b), _ss(0) {}

    // Constructor taking the `split` tag: shares the input arrays with x
    // but starts from an empty partial sum.
    Norm2( Norm2& x, split ) : _a(x._a), _b(x._b), _ss(0) {}

    // Adds the squared differences for indices [r.begin(), r.end()) to the
    // partial sum held by this body.
    void operator( )( const blocked_range<size_t>& r )
    {
        // Work on a local copy so the loop does not update the member on
        // every iteration; the summation order is unchanged.
        double partial = _ss;
        const size_t last = r.end();
        for (size_t k = r.begin(); k != last; ++k) {
            // The subtraction is done in float, then widened to double
            // before squaring.
            const double diff = static_cast<double>(_b[k] - _a[k]);
            partial += diff * diff;
        }
        _ss = partial;
    }

    // Merges the partial sum of another body into this one.
    void join( const Norm2& y ) { _ss += y._ss; }

    // Returns the square root of the accumulated sum.
    double resut(){ return sqrt(_ss); }

};

// Body for parallel_for: for every row index i in the given range it computes
//   y[i] = exp(-d*d / 2),  d = Euclidean distance between x and row i of a,
// where a is read as a row-major array with _n floats per row.
class Gauss{

    int _n;           // number of elements per row of _a (and length of _x)

    float *_x, *_y;   // input vector and output array (not owned)

    float *_a;        // row-major matrix data (not owned)

public:

    // n: row length; x: input vector; a: matrix rows; y: output, one value per row.
    // The initializer list follows the declaration order of the members.
    Gauss(int n, float *x, float *a, float *y)
    : _n(n), _x(x), _y(y), _a(a) { }

    // Processes rows [r.begin(), r.end()).
    void operator()(const blocked_range<size_t>& r) const {
        for(size_t i = r.begin(); i < r.end(); i++) {
            Norm2 d(_x, &_a[_n*i]);

            // Nested reduction over one row. NOTE(review): when _n does not
            // exceed GRAIN_SIZE this range cannot be split, so the call only
            // adds scheduling overhead on top of a serial loop.
            parallel_reduce(blocked_range<size_t>(0, _n, GRAIN_SIZE), d, auto_partitioner());

            // Fetch the distance once; resut() performs a sqrt on each call.
            const double dist = d.resut();
            _y[i] = (float)exp(-dist*dist/2.0);
        }
    }

};

// Serial reference implementation.
// For each of the m rows of the row-major array a (n floats per row) computes
//   y[i] = exp(-d*d / 2),  d = sqrt(sum_j (a[i][j] - x[j])^2).
// x must hold n values and y must have room for m results.
void serialGauss(float* x, float *a, float *y, int m, int n )
{
    for (int row = 0; row < m; ++row) {
        const float *rowData = a + n*row;

        // Each squared difference is formed in float and then accumulated
        // into a double sum.
        double sumSq = 0;
        for (int col = 0; col < n; ++col) {
            const float diff = rowData[col] - x[col];
            sumSq += diff*diff;
        }

        const double dist = sqrt(sumSq);
        y[row] = (float)exp(-dist*dist/2.0);
    }
}

// MKL-based implementation.
// For each of the m rows of the row-major array a (n floats per row) computes
//   y[i] = exp(-d*d / 2),  d = 2-norm of (x - row i of a),
// using vsSub for the element-wise difference and snrm2 for the norm.
// x must hold n values and y must have room for m results.
void mklGauss(float* x, float *a, float *y, int m, int n )
{
    int one = 1;   // unit stride, passed by address to snrm2

    // Scratch buffer for the difference vector of one row.
    float *v = (float*)malloc(n*sizeof(float));
    if (!v) {
        // Allocation failed: nothing can be computed, y is left untouched.
        return;
    }

    for (int i = 0; i < m; i++) {
        // Row i starts at offset n*i because each row holds n elements.
        // (The offset was previously m*i, which selected the wrong data and
        // read past the end of a whenever m > n.)
        vsSub(n, x, &a[n*i], v);

        const float ss = snrm2(&n, v, &one);

        y[i] = (float)exp(-ss*ss/2.0);
    }

    free(v);
}

// Benchmark driver: fills an M x N matrix and an N-vector with constants,
// then times the serial, MKL and TBB implementations in turn, printing the
// elapsed time and the first ten results of each.
int main(int argc, char *argv[])
{
    int i, j;

    float a[DIM];   // M rows of N floats, row-major

    float x[N];

    float y[M];

    int m = M;

    int n = N;

    for(i=0; i<M; i++) {
        for(j=0; j<N; j++) a[N*i+j] = 2.5f;
    }

    for(i=0; i<N; i++) x[i] = 2.2f;

    tick_count t0, t1;

    // --- serial reference ---
    t0 = tick_count::now();

    serialGauss(x, a, y, M, N);

    t1 = tick_count::now();

    printf("serial: %.3g msec\n", (t1-t0).seconds()*1000);

    for(i=0; i<10; i++) printf("%8.4f", y[i]);

    printf("\n");

    // --- MKL ---
    // Clear y so the printed values can only come from this run.
    for(i=0; i<M; i++) y[i] =0;

    t0 = tick_count::now();

    mklGauss(x, a, y, M, N);

    t1 = tick_count::now();

    printf("mkl: %.3f msec\n", (t1-t0).seconds()*1000);

    for(i=0; i<10; i++) printf("%8.4f", y[i]);

    printf("\n");

    // --- TBB ---
    // Clear y again so stale MKL results cannot be mistaken for parallel output.
    for(i=0; i<M; i++) y[i] =0;

    // NOTE(review): the work measured here is well under a millisecond, and
    // the first TBB algorithm call presumably also pays for scheduler and
    // worker start-up; a larger problem or an untimed warm-up run would give
    // a fairer comparison.
    t0 = tick_count::now();

    // Use the default grain size (1) and let auto_partitioner pick chunk
    // sizes. With a grain size of 1000 the 200-element range could never be
    // split, so the whole loop ran as a single task on one thread.
    parallel_for(blocked_range<size_t>(0, m), Gauss(n, x, a, y), auto_partitioner());

    t1 = tick_count::now();

    printf("parallel: %.3f msec\n", (t1-t0).seconds()*1000);

    for(i=0; i<10; i++) printf("%8.4f", y[i]);

    printf("\n");

    return 0;
}

serial: 0.899 msec

0.0111 0.0111 0.0111 0.0111 0.0111 0.0111 0.0111 0.0111 0.0111 0.0111

mkl: 19.049 msec

0.0111 0.0111 0.0111 0.0111 0.0111 0.0111 0.0111 0.0111 0.0111 0.0111

parallel: 0.967 msec

0.0111 0.0111 0.0111 0.0111 0.0111 0.0111 0.0111 0.0111 0.0111 0.0111