variable size two dimensional matrix multiplication

variable size two dimensional matrix multiplication

Could anyone show me how to implement two-dimensional matrix multiplication with variable size 2D arrays? The example illustrated in Intel TBB book uses fixed size arrays. Thanks!

Long

16 posts / 0 new
Last post
For more complete information about compiler optimizations, see our Optimization Notice.

Representations of 2D variable-size matrices very widely in C++. If you sketch your preferred approach for serial multiplication of variable-size matrices, then perhaps someone can post the TBBified version.

- Arch Robison

Here is the code from Intel TBB book that I've modified to use Numerical Recipe C++ matrix class, Mat_SP ( matrix single precision). The code compiles and executes, but calculations from serial and parallel portionsof the code do not match. I think the code using NR pass in matrix data ok but having problem returning matrix data back to the main program. Your help is greatly appreciated!

Long

p.s. I think the problem occurs within operator() code(i.e, Mat_SP c = my_c, this creates a new copy of matrix c insteads of the original matrix c; I have no idea how to pass this new copy of the matrix c back to the main program). Thanks in advance for any suggestion!

//

// Copyright 2007 Intel Corporation. All Rights Reserved.

//

#include

#include

#include

#include

#include "tbb/task_scheduler_init.h"

#include "tbb/tick_count.h"

// C++ numerical recipe header

#include "nr.h"

#include "tbb/parallel_for.h"

#include "tbb/blocked_range2d.h"

using namespace tbb;

using namespace std;

//! Working threads count for parallel version

static int NThread = 2;

//const size_t L = 150;

//const size_t M = 225;

//const size_t N = 300;

//void SerialMatrixMultiply( float c[M][N], float a[M][L], float b[L][N] ) {

void SerialMatrixMultiply( Mat_SP &c, Mat_SP a, Mat_SP b ) {

for( int i=0; i

for( int j=0; j

float sum = 0;

for( int k=0; k

sum += a[i][k]*b[k][j];

c[i][j] = sum;

}

}

}

/
*

class MatrixMultiplyBody2D {

float (*my_a)[L];

float (*my_b)[N];

float (*my_c)[N];

public:

void operator()( const blocked_range2d& r ) const {

float (*a)[L] = my_a; // a,b,c used in example to emphasize

float (*b)[N] = my_b; // commonality with serial code

float (*c)[N] = my_c;

for( size_t i=r.rows().begin(); i!=r.rows().end(); ++i ){

for( size_t j=r.cols().begin(); j!=r.cols().end(); ++j ) {

float sum = 0;

for( size_t k=0; k

sum += a[i][k]*b[k][j];

c[i][j] = sum;

}

}

}

// MatrixMultiplyBody2D( float c[M][N], float a[M][L], float b[L][N] ) :

MatrixMultiplyBody2D( Mat_SP c, Mat_SP a, Mat_SP b ) :

my_a(a), my_b(b), my_c(c)

{}

};

*/

class MatrixMultiplyBody2D {

Mat_SP my_a, my_b, my_c;

public:

void operator()( const blocked_range2d<int>& r ) const {

Mat_SP a = my_a; // a,b,c used in example to emphasize

Mat_SP b = my_b; // commonality with serial code

Mat_SP c = my_c;

for( int i=r.rows().begin(); i!=r.rows().end(); ++i ){

for( int j=r.cols().begin(); j!=r.cols().end(); ++j ) {

float sum = 0;

for( int k=0; k

sum += a[i][k]*b[k][j];

c[i][j] = sum;

}

}

}

// MatrixMultiplyBody2D( float c[M][N], float a[M][L], float b[L][N] ) :

MatrixMultiplyBody2D( Mat_SP &c, Mat_SP a, Mat_SP b ) :

my_a(a), my_b(b), my_c(c)

{}

};

//void ParallelMatrixMultiply(float c[M][N], float a[M][L], float b[L][N]){

void ParallelMatrixMultiply( Mat_SP &c, Mat_SP a, Mat_SP b){

parallel_for( blocked_range2d<int>(0, a.nrows(), 16, 0, b.ncols(), 32),

MatrixMultiplyBody2D(c,a,b) );

}

//static void CreateData( float a[M][L], float b[L][N] ) {

static void CreateData( Mat_SP &a, Mat_SP &b) {

for( int i=0; i

for( <
/FONT>int j=0; j

a[i][j] = (rand()%1000)/100.0f;

}

}

for( int i=0; i

for( int j=0; j

b[i][j] = (rand()%1000)/100.0f;

}

}

}

//static bool VerifyResult( float c1[M][N], float c2[M][N] ) {

static bool VerifyResult(Mat_SP c1, Mat_SP c2 ) {

for( int i=0; i

for( int j=0; j

if ( abs(c1[i][j] - c2[i][j]) > 0.0001f ) {

return false;

}

}

}

return true;

}

static void ParseCommandLine( int argc, char* argv[] ) {

int i = 1;

if( i

fprintf(stderr,"Usage: %s number-of-threads
",argv[0]);

exit(1);

}

if( i

}

int main( int argc, char* argv[] ) {

srand(2);

ParseCommandLine( argc, argv );

int L,M,N;

cout << "please enter L,M,N (ex. 150 225 300) = " << endl;

cin >> L >> M >> N;

//const size_t L = 150;

//const size_t M = 225;

//const size_t N = 300;

task_scheduler_init init(NThread);

// float a[M][L];

// float b[L][N];

// using numerical recipe matrix class

Mat_SP a(M,L);

Mat_SP b(L,N);

CreateData(a, b);

// float c1[M][N];

// using numerical recipe matrix class

Mat_SP c1(M,N);

tick_count t0 = tick_count::now();<
/P>

SerialMatrixMultiply(c1, a, b);

tick_count t1 = tick_count::now();

printf("Serial code: %.4f sec
", (t1-t0).seconds());

// float c2[M][N];

// using numerical recipe matrix class

Mat_SP c2(M,N);

t0 = tick_count::now();

ParallelMatrixMultiply(c2, a, b);

t1 = tick_count::now();

printf("Parallel code (%d threads): %.4f sec
",

NThread, (t1-t0).seconds());

if ( VerifyResult(c1, c2) ) {

printf("Results match!
");

return 0;

} else {

printf("Results don't match!
");

return 2;

}

}

The problem was values versus references. Each time a Mat_SP is passed by value, a copy is made. That's unnecessary expense for matrices a and b, and as you noted, gives wrong answers for matrix c.

The solution is to pass a and b by const reference, and pass c by reference.Appended isa diff that shows the necessary changes. The changes are all in MatrixMultiplyBody2D:

  1. Changemy_a, my_b, and my_c to references. Fields my_a and my_b, because they refer to input arguments, should be const references.
  2. Pass the arguments to the constructor by reference also, with const qualifiers on the input arguments.

With these changes, the results match.

- Arch Robison

*** matrix.cpp.orig 2007-10-04 08:48:52.936144000 -0500
--- matrix.cpp.fixed 2007-10-04 09:08:05.805223000 -0500
***************
*** 29,54 ****
}
}
}
 class MatrixMultiplyBody2D {
! Mat_SP my_a, my_b, my_c;
public:
void operator()( const blocked_range2d& r ) const {
! Mat_SP a = my_a; // a,b,c used in example to emphasize
! Mat_SP b = my_b; // commonality with serial code
! Mat_SP c = my_c;
for( int i=r.rows().begin(); i!=r.rows().end(); ++i ){
for( int j=r.cols().begin(); j!=r.cols().end(); ++j ) {
float sum = 0;
for( int k=0; k sum += a[i][k]*b[k][j];
c[i][j] = sum;
}
}
}
! MatrixMultiplyBody2D( Mat_SP &c, Mat_SP a, Mat_SP b ) :
my_a(a), my_b(b), my_c(c)
{}
};
 void ParallelMatrixMultiply( Mat_SP &c, Mat_SP a, Mat_SP b){
--- 29,55 ----
}
}
}
 class MatrixMultiplyBody2D {
! const Mat_SP &my_a, &my_b;
! Mat_SP &my_c;
public:
void operator()( const blocked_range2d& r ) const {
! const Mat_SP& a = my_a; // a,b,c used in example to emphasize
! const Mat_SP& b = my_b; // commonality with serial code
!& nbsp; Mat_SP& c = my_c;
for( int i=r.rows().begin(); i!=r.rows().end(); ++i ){
for( int j=r.cols().begin(); j!=r.cols().end(); ++j ) {
float sum = 0;
for( int k=0; k sum += a[i][k]*b[k][j];
c[i][j] = sum;
}
}
}
! MatrixMultiplyBody2D( Mat_SP &c, const Mat_SP& a, const Mat_SP& b ) :
my_a(a), my_b(b), my_c(c)
{}
};
 void ParallelMatrixMultiply( Mat_SP &c, Mat_SP a, Mat_SP b){

For the record, appended is thecomplete code that I ran successfully, with the commented-out code removed.

I discovereda quirkabout this forum's text editor. If I copy-and-paste the code directly from Visual Studio, all the indentation is lost. But if I copy-and-past into Word first, and then copy-and-paste from Word, the indentation is retained.

- Arch Robison

#include

#include

#include

#include

#include

// TBB headers

#include "tbb/task_scheduler_init.h"

#include "tbb/tick_count.h"

#include "tbb/parallel_for.h"

#include "tbb/blocked_range2d.h"

// C++ numerical recipe header

#include "nr.h"

using namespace tbb;

using namespace std;

//! Working threads count for parallel version

static int NThread = 2;

void SerialMatrixMultiply( Mat_SP &c, Mat_SP a, Mat_SP b ) {

for( int i=0; i

for( int j=0; j

float sum = 0;

for( int k=0; k

sum += a[i][k]*b[k][j];

c[i][j] = sum;

}

}

}

class MatrixMultiplyBody2D {

const Mat_SP &my_a, &my_b;

Mat_SP &my_c;

public:

void operator()( const blocked_range2d<int>& r ) const {

const Mat_SP& a = my_a; // a,b,c used in example to emphasize

const Mat_SP& b = my_b; // commonality with serial code

Mat_SP& c = my_c;

for( int i=r.rows().begin(); i!=r.rows().end(); ++i ){

for( int j=r.cols().begin(); j!=r.cols().end(); ++j ) {

float sum = 0;

for( int k=0; k

sum += a[i][k]*b[k][j];

c[i][j] = sum;

}

}

}

MatrixMultiplyBody2D( Mat_SP &c, const Mat_SP& a, const Mat_SP& b ) :

my_a(a), my_b(b), my_c(c)

{}

};

void ParallelMatrixMultiply( Mat_SP &c, Mat_SP a, Mat_SP b){

parallel_for( blocked_range2d<int>(0, a.nrows(), 16, 0, b.ncols(), 32),

MatrixMultiplyBody2D(c,a,b) );

}

static void CreateData( Mat_SP &a, Mat_SP &b) {

for( int i=0; i

for( int j=0; j

a[i][j] = (rand()%1000)/100.0f;

}

}

for( int i=0; i

for( int j=0; j

b[i][j] = (rand()%1000)/100.0f;

}

}

}

static bool VerifyResult(Mat_SP c1, Mat_SP c2 ) {

for( int i=0; i

for( int j=0; j

if ( abs(c1[i][j] - c2[i][j]) > 0.0001f ) {

return false;

}

}

}

return true;

}

static void ParseCommandLine( int argc, char* argv[] ) {

int i = 1;

if( i

fprintf(stderr,"Usage: %s number-of-threads ",argv[0]);

exit(1);

}

if( i

}

int main( int argc, char* argv[] ) {

srand(2);

ParseCommandLine( argc, argv );

int L,M,N;

cout << "please enter L,M,N (ex. 150 225 300) = " << endl;

cin >> L >> M >> N;

task_scheduler_init init(NThread);

// using numerical recipe matrix class

Mat_SP a(M,L);

Mat_SP b(L,N);

CreateData(a, b);

// using numerical recipe matrix class

Mat_SP c1(M,N);

tick_count t0 = tick_count::now();

SerialMatrixMultiply(c1, a, b);

tick_count t1 = tick_count::now();

printf("Serial code: %.4f sec ", (t1-t0).seconds());

// using numerical recipe matrix class

Mat_SP c2(M,N);

t0 = tick_count::now();

ParallelMatrixMultiply(c2, a, b);

t1 = tick_count::now();

printf("Parallel code (%d threads): %.4f sec ",

NThread, (t1-t0).seconds());

if ( VerifyResult(c1, c2) ) {

printf("Results match! ");

return 0;

} else {

printf("Results don't match! ");

return 2;

}

}

Hello Arch,

Thanks very much for your help, and your explanation of the 'const ref' makes sense; it clearly shows how to properly use the parallel_for routine.

Best regards,

Long

It's hard to learn and apply something new so that it would work efficiently and correctly. I hope my experience on using tbb will help others.

Thanks for the tbb.dll and the help of Intel TBB developers, my image edit and reconstrunction program which operates on large 2d matriceshas more than double its execution speed. I also notice that by replacing the default new and delete operators used in NR's Mat_DP with TBB book's new and delete operators, I can furthur improve the program significantly on my duo core laptop.

Reason for using TBB book's new/delete operators was due to mytesting that the same program ran twice as fast on my laptop was more than 4x slow down on my 4 processors Xeon desktop machine at work. I have not tried the new version using tbbmalloc.dll on my work desktop yet. I will report the result here nextweekif no one here object.

Long

Hello,

Tested this morning at work, the new version of my application using tbbmalloc.dll run twice as fast as my duo core laptop. I compiled the application using ms visual studio2005 professional (I guess their memory allcation routine is not optimal for parallel processing.)with autopartioned grain size.Btw evenon single processor machine, TBB actually speeds up the application.

Regards,

Long

It's quite possible for TBB's parallel_for to speed up a matrix multiply even on a single processor, because of cache blocking effects. Can you describe the typical dimensions of the matrices that you are multiplying? Knowing that, I or someone else on this thread can suggest potentially faster reordings of the loops.

The 2D matrix multiplication application was just my testing to get aquainted with TBB library. The actual application I am working on is basically a 2D SAR (synthetic aperture radar) polar reformat application that computes 2D interpolations for data matrices as large as 2048x2048, sometime larger. It has two for loops with 2d interpolations done within inner loop. I am experimenting with different grain size to see if it would furthur speed up the processing. Thanks!

Long

Hi.

Im trying to get the example code from this post working, but I'm having some problems regarding blocked_range2d.
I couldn't just copy/paste the code because I don't have the NR library, so I made a few modifications and used newmat instead.

I compile with

g++ matrix_tbb.cpp -ltbb -lnewmat

and get the following error:

matrix_tbb.cpp: In member function void MatrixMultiplyBody2D::operator()(const tbb::blocked_range2d&) const:
matrix_tbb.cpp:34: error: r->tbb::blocked_range2d::rows [with RowValue = int, ColValue = int] does not have class type
matrix_tbb.cpp:35: error: r->tbb::blocked_range2d::cols [with RowValue = int, ColValue = int] does not have class type

The relevant code looks pretty much the same as the one posted above:

class MatrixMultiplyBody2D {
const Matrix &my_a, &my_b;
Matrix &my_c;

public:

void operator()(const blocked_range2d& r) const {
const Matrix& a = my_a;
const Matrix& b = my_b;
Matrix& c = my_c;

for(int i=r.rows().begin(); i!=r.rows.end(); ++i){
for(int j=r.cols().begin(); j!=r.cols.end(); ++j){
float sum = 0;
for(int k=1;k
sum += a(i,k)*b(k,j);
c(i,j) = sum;
}
}
}

void parallelMult(Matrix &c, const Matrix &a, const Matrix &b){
parallel_for( blocked_range2d(1, a.Nrows(), 16, 1, b.Ncols()+1, 32),
MatrixMultiplyBody2D(c,a,b) );
}

Im pretty new to c++ and have been trying to figure out what the whole 'does not have class type' thing means.
Any help would be appreciated.

rows and cols are member functions: you have to call them with end() just like you do with begin(), so adding two pairs of empty braces might be enough, but I have not checked that myself.

Quoting - Raf Schietekat
rows and cols are member functions: you have to call them with end() just like you do with begin(), so adding two pairs of empty braces might be enough, but I have not checked that myself.

Wow, must have been at it too long yesterday. That was the problem indeed.
Thanks so much!

I am new to TBB. In particular, I am curious of the usage of blocked_range2d in the parallel_for. It appears that the parallel_for for the matrix multiplication exploites tiling. If my guess is correct, then I wonder whey the column_range_size is 2 * row_range_size.

Quoting - Arch Robison (Intel)

The problem was values versus references. Each time a Mat_SP is passed by value, a copy is made. That's unnecessary expense for matrices a and b, and as you noted, gives wrong answers for matrix c.

The solution is to pass a and b by const reference, and pass c by reference.Appended isa diff that shows the necessary changes. The changes are all in MatrixMultiplyBody2D:

  1. Changemy_a, my_b, and my_c to references. Fields my_a and my_b, because they refer to input arguments, should be const references.
  2. Pass the arguments to the constructor by reference also, with const qualifiers on the input arguments.

With these changes, the results match.

- Arch Robison

*** matrix.cpp.orig 2007-10-04 08:48:52.936144000 -0500
--- matrix.cpp.fixed 2007-10-04 09:08:05.805223000 -0500
***************
*** 29,54 ****
}
}
}
 class MatrixMultiplyBody2D {
! Mat_SP my_a, my_b, my_c;
public:
void operator()( const blocked_range2d& r ) const {
! Mat_SP a = my_a; // a,b,c used in example to emphasize
! Mat_SP b = my_b; // commonality with serial code
! Mat_SP c = my_c;
for( int i=r.rows().begin(); i!=r.rows().end(); ++i ){
for( int j=r.cols().begin(); j!=r.cols().end(); ++j ) {
float sum = 0;
for( int k=0; k sum += a[i][k]*b[k][j];
c[i][j] = sum;
}
}
}
! MatrixMultiplyBody2D( Mat_SP &c, Mat_SP a, Mat_SP b ) :
my_a(a), my_b(b), my_c(c)
{}
};
 void ParallelMatrixMultiply( Mat_SP &c, Mat_SP a, Mat_SP b){
--- 29,55 ----
}
}
}
 class MatrixMultiplyBody2D {
! const Mat_SP &my_a, &my_b;
! Mat_SP &my_c;
public:
void operator()( const blocked_range2d& r ) const {
! const Mat_SP& a = my_a; // a,b,c used in example to emphasize
! const Mat_SP& b = my_b; // commonality with serial code
!& nbsp; Mat_SP& c = my_c;
for( int i=r.rows().begin(); i!=r.rows().end(); ++i ){
for( int j=r.cols().begin(); j!=r.cols().end(); ++j ) {
float sum = 0;
for( int k=0; k sum += a[i][k]*b[k][j];
c[i][j] = sum;
}
}
}
! MatrixMultiplyBody2D( Mat_SP &c, const Mat_SP& a, const Mat_SP& b ) :
my_a(a), my_b(b), my_c(c)
{}
};
 void ParallelMatrixMultiply( Mat_SP &c, Mat_SP a, Mat_SP b){

"I am new to TBB. In particular, I am curious of the usage of blocked_range2d in the parallel_for. It appears that the parallel_for for the matrix multiplication exploites tiling."
Each tile in the product results from two corresponding bands in the multiplicands, which can be a lot faster, I suppose, than a naive application of the matrix product definition if the bands and the tile fit in the cache but not the three matrices.

"If my guess is correct, then I wonder whey the column_range_size is 2 * row_range_size."
I don't see what you mean?

I see the forum has decided to leave out the editing option today. :-)

Sorry, I have to correct myself, the naive definition would not require all three matrices to fit, either. Probably "just" one of the matrices plus a row/column in the other, but that's already quadratic compared to two bands and a tile.

Leave a Comment

Please sign in to add a comment. Not a member? Join today