>>...
>>double U0[n1][n2];
>>double U1[n1][n2];
>>double U2[n1][n2]; /* buffer*/
>>double *cU0[n1][n2];
>>/* simd */
>>__m128d vU0[n1][n2];
>>__m128d vU1[n1][n2];
>>__m128d vU2[n1][n2]; /* buffer*/
>>...
Could you try to declare all these buffers with a '__declspec( align( 16 ) )' specificator?





Using SIMD instructions to optimize Numeric 2D derivation
Hi,
I am trying to optimize a simple loop with SIMD SSE instructions. The code compute a 2D field and calculate heat propagation trough time.
Loops use only simple operations, and reference loop is really simple.
The following program contains 3 loops : loop 0 is the reference, loop 1 is a decomposition of the operations, and loop 2 is the optimize loop. However, the second loop is slightly faster than the first one, and the optimize loop is really slow. I am using x64 gcc 4.7 and icc 11 compilers, sames results with -O2, -O3, -msse2/-xSSE2.
Did I make something wrong ?
With my best regards.
Moomba
#include <stdio.h> #include <time.h> #include<emmintrin.h> #define n1 102 #define n2 102 #define niter 20000 double U0[n1][n2]; double U1[n1][n2]; double U2[n1][n2]; /* buffer*/ double *cU0[n1][n2]; /* simd */ __m128d vU0[n1][n2]; __m128d vU1[n1][n2]; __m128d vU2[n1][n2]; /* buffer*/ int i,j,t; double Dx,Dy,Lx,Ly,InvDxDx,InvDyDy,Dt,alpha,totaltime,Stab,DtAlpha,DxDx,DyDy; clock_t time0,time1; FILE *f1; int main() { /* ---- GENERAL ---- */ alpha = 0.4; totaltime = 1.0; Dt = totaltime/((niter-1)*1.0); Lx = 1.0; Ly = 1.0; Dx = Lx/((n1-1)*1.0); Dy = Ly/((n2-1)*1.0); InvDxDx = 1.0/(Dx*Dx); InvDyDy = 1.0/(Dy*Dy); DxDx = Dx*Dx; DyDy = Dy*Dy; Stab = alpha*Dt*(InvDxDx+InvDyDy); DtAlpha = Dt*alpha; /* Stability if result <= 0.5 */ printf("Stability factor : %f n",Stab); /* +----------------+ */ /* | LOOP 0 | */ /* +----------------+ */ /* Init */ for( i = 0; i < n1; i++) { for( j = 0; j < n2; j++) { U0[i][j] = 0.0; U1[i][j] = 0.0; } } for( i = 0; i < n1; i++) { U0[i][0] = 1.0; U1[i][0] = 1.0; } printf("Init OK n"); /* Core */ time0=clock(); for( t = 0; t < niter; t++) { /* even */ for( i = 1; i < n1-1; i++) { for( j = 1; j < n2-1; j++) { U1[i][j] = U0[i][j] + DtAlpha*( (U0[i+1][j]-2.0*U0[i][j]+U0[i-1][j])*InvDxDx + (U0[i][j+1]-2.0*U0[i][j]+U0[i][j-1])*InvDyDy); } } /* odd */ for( i = 1; i < n1-1; i++) { for( j = 1; j < n2-1; j++) { U0[i][j] = U1[i][j] + DtAlpha*( (U1[i+1][j]-2.0*U1[i][j]+U1[i-1][j])*InvDxDx + (U1[i][j+1]-2.0*U1[i][j]+U1[i][j-1])*InvDyDy); } } } time1=clock(); printf("Loop 0, total time : %f n", (double) time1-time0); f1 = fopen ("out0.dat", "wt"); for( i = 1; i < n1-1; i++) { for( j = 1; j < n2-1; j++) { fprintf (f1, "%dt%dt%fn", i, j, U0[i][j]); } } /* +----------------+ */ /* | LOOP 1 | */ /* +----------------+ */ /* Init */ for( i = 0; i < n1; i++) { for( j = 0; j < n2; j++) { U0[i][j] = 0.0; U1[i][j] = 0.0; } } for( i = 0; i < n1; i++) { U0[i][0] = 1.0; U1[i][0] = 1.0; } printf("Init OK n"); /* Core */ time0=clock(); for( t = 0; t < niter; t++) { /* even */ for( i = 1; i < n1-1; i++) { for( j = 1; j < n2-1; j++) { /* U1[i][j] = U0[i][j] + Dt*alpha*( (U0[i+1][j]-2.0*U0[i][j]+U0[i-1][j])*InvDxDx + (U0[i][j+1]-2.0*U0[i][j]+U0[i][j-1])*InvDyDy); */ U1[i][j] = -2.0*U0[i][j]; U1[i][j] = U1[i][j] + U0[i+1][j]; U1[i][j] = U1[i][j] + U0[i-1][j]; U1[i][j] = U1[i][j] * InvDxDx; U2[i][j] = -2.0*U0[i][j]; U2[i][j] = U2[i][j] + U0[i][j+1]; U2[i][j] = U2[i][j] + U0[i][j-1]; U2[i][j] = U2[i][j] * InvDyDy; U1[i][j] = U1[i][j] + U2[i][j]; U1[i][j] = U1[i][j] * DtAlpha; U1[i][j] = U1[i][j] + U0[i][j]; } } /* odd */ for( i = 1; i < n1-1; i++) { for( j = 1; j < n2-1; j++) { U0[i][j] = -2.0*U1[i][j]; U0[i][j] = U0[i][j] + U1[i+1][j]; U0[i][j] = U0[i][j] + U1[i-1][j]; U0[i][j] = U0[i][j] * InvDxDx; U2[i][j] = -2.0*U1[i][j]; U2[i][j] = U2[i][j] + U1[i][j+1]; U2[i][j] = U2[i][j] + U1[i][j-1]; U2[i][j] = U2[i][j] * InvDyDy; U0[i][j] = U0[i][j] + U2[i][j]; U0[i][j] = U0[i][j] * DtAlpha; U0[i][j] = U0[i][j] + U1[i][j]; } } } time1=clock(); printf("Loop 1, total time : %f n", (double) time1-time0); /* End */ f1 = fopen ("out1.dat", "wt"); for( i = 1; i < n1-1; i++) { for( j = 1; j < n2-1; j++) { fprintf (f1, "%dt%dt%fn", i, j, U0[i][j]); } } /* +----------------+ */ /* | LOOP 3 | */ /* +----------------+ */ /* Init */ for( i = 0; i < n1; i++) { for( j = 0; j < n2; j++) { vU0[i][j] = _mm_set1_pd(0.0); vU1[i][j] = _mm_set1_pd(0.0); } } for( i = 0; i < n1; i++) { vU0[i][0] = _mm_set1_pd(1.0); vU1[i][0] = _mm_set1_pd(1.0); } printf("Init OK n"); /* Core */ time0=clock(); for( t = 0; t < niter; t++) { /* even */ for( i = 1; i < n1-1; i++) { for( j = 1; j < n2-1; j+=2) { /* U1[i][j] = U0[i][j] + Dt*alpha*( (U0[i+1][j]-2.0*U0[i][j]+U0[i-1][j])*InvDxDx + (U0[i][j+1]-2.0*U0[i][j]+U0[i][j-1])*InvDyDy); */ __m128d va = _mm_set1_pd(-2.0); /* U1[i][j] = -2.0*U0[i][j]; */ vU1[i][j] = _mm_mul_pd(va,vU0[i][j]); vU1[i][j] = _mm_add_pd(vU1[i][j],vU0[i+1][j]); /* U1[i][j] = U1[i][j] + U0[i+1][j]; */ vU1[i][j] = _mm_add_pd(vU1[i][j],vU0[i-1][j]); /* U1[i][j] = U1[i][j] + U0[i-1][j]; */ __m128d vb = _mm_set1_pd(InvDxDx); /* U1[i][j] = U1[i][j] * InvDxDx; */ vU1[i][j] = _mm_mul_pd(vb,vU1[i][j]); vU2[i][j] = _mm_mul_pd(va,vU0[i][j]); /* U2[i][j] = -2.0*U0[i][j]; */ vU2[i][j] = _mm_add_pd(vU2[i][j],vU0[i+1][j]); /* U2[i][j] = U2[i][j] + U0[i+1][j]; */ vU2[i][j] = _mm_add_pd(vU2[i][j],vU0[i-1][j]); /* U2[i][j] = U2[i][j] + U0[i-1][j]; */ __m128d vc = _mm_set1_pd(InvDyDy); /* U2[i][j] = U2[i][j] * InvDyDy; */ vU2[i][j] = _mm_mul_pd(vc,vU2[i][j]); vU1[i][j] = _mm_add_pd(vU1[i][j],vU2[i][j]); /* U1[i][j] = U1[i][j] + U2[i][j]; */ __m128d vd = _mm_set1_pd(DtAlpha); /* U1[i][j] = U1[i][j] * DtAlpha; */ vU1[i][j] = _mm_mul_pd(vd,vU1[i][j]); vU1[i][j] = _mm_add_pd(vU1[i][j],vU0[i][j]); /* U1[i][j] = U1[i][j] + U0[i][j]; */ } } /* odd */ for( i = 1; i < n1-1; i++) { for( j = 1; j < n2-1; j++) { __m128d va = _mm_set1_pd(-2.0); /* U0[i][j] = -2.0*U1[i][j]; */ vU0[i][j] = _mm_mul_pd(va,vU1[i][j]); vU0[i][j] = _mm_add_pd(vU0[i][j],vU1[i+1][j]); /* U0[i][j] = U0[i][j] + U1[i+1][j]; */ vU0[i][j] = _mm_add_pd(vU0[i][j],vU1[i-1][j]); /* U0[i][j] = U0[i][j] + U1[i-1][j]; */ __m128d vb = _mm_set1_pd(InvDxDx); /* U0[i][j] = U0[i][j] * InvDxDx; */ vU0[i][j] = _mm_mul_pd(vb,vU0[i][j]); vU2[i][j] = _mm_mul_pd(va,vU1[i][j]); /* U2[i][j] = -2.0*U1[i][j]; */ vU2[i][j] = _mm_add_pd(vU2[i][j],vU1[i+1][j]); /* U2[i][j] = U2[i][j] + U1[i+1][j]; */ vU2[i][j] = _mm_add_pd(vU2[i][j],vU1[i-1][j]); /* U2[i][j] = U2[i][j] + U1[i-1][j]; */ __m128d vc = _mm_set1_pd(InvDyDy); /* U2[i][j] = U2[i][j] * InvDyDy; */ vU2[i][j] = _mm_mul_pd(vc,vU2[i][j]); vU0[i][j] = _mm_add_pd(vU0[i][j],vU2[i][j]); /* U0[i][j] = U0[i][j] + U2[i][j]; */ __m128d vd = _mm_set1_pd(DtAlpha); /* U0[i][j] = U0[i][j] * DtAlpha; */ vU0[i][j] = _mm_mul_pd(vd,vU0[i][j]); vU0[i][j] = _mm_add_pd(vU0[i][j],vU1[i][j]); /* U0[i][j] = U0[i][j] + U1[i][j]; */ } } } time1=clock(); printf("Loop 3, total time : %f n", (double) time1-time0); /* End */ f1 = fopen ("out3.dat", "wt"); for( i = 1; i < n1-1; i++) { for( j = 1; j < n2-1; j+=2) { fprintf (f1, "%dt%dt%fn", i, j, vU0[i][j]); } } }