Hi All.
I am new with SIMD programming, had a peice of code as below -
--
#include
#include
#include
#include
// Set up a vector type for a float[4] array for each vector type
typedef __m128 vFloat;
// Also define some macros to map a virtual SIMD language to
// each actual SIMD language.
// Note that because i MUST be an immediate, it is incorrect here
// to alias i to a stackbased copy and replicate that 4 times.
#define vSplat( v, i ) ({ __m128 a = v; a = _mm_shuffle_ps( a, a, _MM_SHUFFLE(i,i,i,i) ); a; })
inline __m128 vMADD( __m128 a, __m128 b, __m128 c )
{
return _mm_add_ps( c, _mm_mul_ps( a, b ) );
}
#define vLoad( ptr ) _mm_load_ps( (float*) (ptr) )
#define vStore( v, ptr ) _mm_store_ps( (float*) (ptr), v )
#define vZero() _mm_setzero_ps()
#define empty() _m_empty(); //clears the SSE2 registers and SSE2 state.
// Prototype for a vector matrix multiply function
void MyMatrixMultiply( vFloat A[4], vFloat B[4], vFloat C[4] );
int main( void )
{
// The vFloat type (defined previously) is a vector or scalar array that contains 4 floats
// Thus each one of these is a 10x10 matrix, stored in the C storage order.
vFloat A[10];
vFloat B[10];
vFloat C1[10];
vFloat C2[10];
int i, j, k;
// Pointers to the elements in A, B, C1 and C2
float *a = (float*) &A;
float *b = (float*) &B;
float *c1 = (float*) &C1;
float *c2 = (float*) &C2;
// Initialize the data
for( i = 0; i < 100; i++ )
{
a[i] = (double) (rand() - RAND_MAX/2) / (double) (RAND_MAX );
b[i] = (double) (rand() - RAND_MAX/2) / (double) (RAND_MAX );
c1[i] = c2[i] = 0.0;
}
// Perform the brute-force version of matrix multiplication and use this later to check for correctness
printf( "Doing simple matrix multiply...\n" );
for( i = 0; i < 10; i++ )
for( j = 0; j < 10; j++ )
{
float result = 0.0f;
for( k = 0; k < 10; k++ )
result += a[ i * 10 + k] * b[ k * 10 + j ];
c1[ i * 10 + j ] = result;
}
// The vector version
printf( "Doing vector matrix multiply...\n" );
MyMatrixMultiply( A, B, C2 );
// Make sure that the results are correct
// Allow for some rounding error here
printf( "Verifying results..." );
for( i = 0 ; i < 100; i++ )
if( fabs( c1[i] - c2[i] ) > 1e-6 )
printf( "failed at %i,%i: %8.17g %8.17g\n", i/4, i&3, c1[i], c2[i] );
printf( "done.\n" );
return 0;
}
void MyMatrixMultiply( vFloat A[16], vFloat B[16], vFloat C[16] )
{
vFloat A1 = vLoad( A ); //Row 1 of A
vFloat A2 = vLoad( A + 1 ); //Row 2 of A
vFloat A3 = vLoad( A + 2 ); //Row 3 of A
vFloat A4 = vLoad( A + 3); //Row 4 of A
vFloat C1 = vZero(); //Row 1 of C, initialized to zero
vFloat C2 = vZero(); //Row 2 of C, initialized to zero
vFloat C3 = vZero(); //Row 3 of C, initialized to zero
vFloat C4 = vZero(); //Row 4 of C, initialized to zero
vFloat B1 = vLoad( B ); //Row 1 of B
vFloat B2 = vLoad( B + 1 ); //Row 2 of B
vFloat B3 = vLoad( B + 2 ); //Row 3 of B
vFloat B4 = vLoad( B + 3); //Row 4 of B
//Multiply the first row of B by the first column of A (do not sum across)
C1 = vMADD( vSplat( A1, 0 ), B1, C1 );
C2 = vMADD( vSplat( A2, 0 ), B1, C2 );
C3 = vMADD( vSplat( A3, 0 ), B1, C3 );
C4 = vMADD( vSplat( A4, 0 ), B1, C4 );
// Multiply the second row of B by the second column of A and
// add to the previous result (do not sum across)
C1 = vMADD( vSplat( A1, 1 ), B2, C1 );
C2 = vMADD( vSplat( A2, 1 ), B2, C2 );
C3 = vMADD( vSplat( A3, 1 ), B2, C3 );
C4 = vMADD( vSplat( A4, 1 ), B2, C4 );
// Multiply the third row of B by the third column of A and
// add to the previous result (do not sum across)
C1 = vMADD( vSplat( A1, 2 ), B3, C1 );
C2 = vMADD( vSplat( A2, 2 ), B3, C2 );
C3 = vMADD( vSplat( A3, 2 ), B3, C3 );
C4 = vMADD( vSplat( A4, 2 ), B3, C4 );
// Multiply the fourth row of B by the fourth column of A and
// add to the previous result (do not sum across)
C1 = vMADD( vSplat( A1, 3 ), B4, C1 );
C2 = vMADD( vSplat( A2, 3 ), B4, C2 );
C3 = vMADD( vSplat( A3, 3 ), B4, C3 );
C4 = vMADD( vSplat( A4, 3 ), B4, C4 );
// Write out the result to the destination
vStore( C1, C );
vStore( C2, C + 1 );
vStore( C3, C + 2 );
vStore( C4, C + 3 );
empty(); //clears the SSE2 registers and SSE2 state.
}
------
I have following queries though it could be simple for others, they are -
(a) What does complete content in L#14 mean?
-----
#define vSplat( v, i ) ({ __m128 a = v; a = _mm_shuffle_ps( a, a, _MM_SHUFFLE(i,i,i,i) ); a; })
-----
Could it be elaborated with sequences as mentioned below for above MACRO.
"__m128 a = v;
a = _mm_shuffle_ps( a, a, _MM_SHUFFLE(i,i,i,i) );
a;
(b) In L#25, I had defined "_m_empty()" for SSe2 which is being called in MyMatrixMultiply() API at the end,
does use of it at the end can improve performance or simply not needed for SSE2?
(c) Within this code, can I perform -
(i) Vectorization with Compiler FLAGS(Intel/GNU) for FOR loops for L#56 - 63?
(ii) This code has been written to address SSE2, could it be checked for x87 FP operations registers too, if YES, how?
(iii) Can a comparison be generated both for x87 & SSE2 FP registers in performances, if YES - how it can be performed & analyzed?
Appreciate you responses for above.
I do refer Intel C++ Intrinisic Reference & Intel C++ Compiler documents.
~BR




