Hi,

I have an i7 965 cpu running on win7 and using tbb 3.0.

The attached program is designed to gather, into a column-major matrix, elements that live in different buffers

(in this case a bigger matrix).

I am trying to see how many processing units (out of 8?) are engaged in the calculation. From the task mgr

I see at most 4.

//stl #include using std::vector; //tbb //tbb #include "tbbparallel_for.h" #include "tbbparallel_reduce.h" #include "tbbblocked_range.h" #include "tbbblocked_range2d.h" // function to gather data into a single matrix as columns template< typename _M, typename _T> void gather_data_into_matrix_columns( _M & m, vector<_T*> const & dataStart, vector const & dataStride ){ struct column_gatherer{ _M & m_; vector<_T*> const & dataStart_; vector const & dataStride_; column_gatherer( _M & m, vector<_T*> const & dataStart, vector const & dataStride ) :m_(m), dataStart_(dataStart), dataStride_(dataStride) {} void operator()( const tbb::blocked_range2d< size_t > & r ) const { size_t rowStart = r.rows().begin(); size_t rowEnd = r.rows().end(); size_t colStart = r.cols().begin(); size_t colEnd = r.cols().end(); for ( size_t colIdx = colStart; colIdx != colEnd; ++colIdx ){ size_t stride = dataStride_[colIdx]; _T const * ptr = dataStart_[colIdx] + rowStart * stride; for ( size_t rowIdx = rowStart; rowIdx != rowEnd; ++rowIdx ){ m_( rowIdx, colIdx ) = *ptr; ptr += stride; } } } }; static const size_t GRAIN_SIZE_ROW = 10; static const size_t GRAIN_SIZE_COLUMN = 10; column_gatherer cg( m, dataStart, dataStride ); tbb::parallel_for( tbb::blocked_range2d( 0, m.size1(), GRAIN_SIZE_ROW, 0, m.size2(), GRAIN_SIZE_COLUMN ), cg, tbb::auto_partitioner() ); } #include using std::cout; using std::endl; #include using boost::numeric::ublas::matrix; bool test_GatherScatter(){ const size_t nRows = 80; const size_t nCols = 20; matrix< double, boost::numeric::ublas::column_major> m(nRows,nCols); const size_t nRowsBig = 800; const size_t nColsBig = 100; matrix< double, boost::numeric::ublas::column_major> mBig(nRowsBig,nColsBig); for ( size_t i = 0; i != nRowsBig; ++i ){ for ( size_t j = 0; j != nColsBig; ++j ){ mBig(i,j) = double(rand())/double(RAND_MAX) - 0.5L; } } vector< double *> colStart( nCols ); vector< size_t > strides( nCols ); for ( size_t i = 0; i != nCols; ++i ) { colStart[i] = 
&(mBig(0,3*i)); strides[i] = 5; } gather_data_into_matrix_columns( m, colStart, strides ); for( size_t i = 0; i != nRows; ++i ){ for( size_t j = 0; j != nCols; ++j ){ if ( m(i,j) != mBig( 5*i, 3*j) ){ cout << "(i, j) = (" << i << ", " << j << ")" << endl; return false; } } } return true; } int main(){ bool b; for ( size_t i = 0; i != 1000; ++i ) b = test_GatherScatter(); }

Is this a reliable test? Is there a better one than eyeballing it ;)) ?

The code is a concatenation of various files. Conceptually, it gives the right result. Please let me know if there is anything more you might need.

Thank you very much for your help,

Petros

ps: the motivation for this is to bring into one place (typically a matrix) data that will constitute the rhs of lapack linear equation solver.