Using SIMD instructions to optimize Numeric 2D derivation

Using SIMD instructions to optimize Numeric 2D derivation

Hi,

I am trying to optimize a simple loop with SIMD SSE instructions. The code compute a 2D field and calculate heat propagation trough time.

Loops use only simple operations, and reference loop is really simple.

The following program contains 3 loops : loop 0 is the reference, loop 1 is a decomposition of the operations, and loop 2 is the optimize loop. However, the second loop is slightly faster than the first one, and the optimize loop is really slow.  I am using x64 gcc 4.7 and icc 11 compilers, sames results with -O2, -O3, -msse2/-xSSE2.

Did I make something wrong ?

With my best regards.

Moomba

#include <stdio.h>
#include <time.h>
#include<emmintrin.h>
#define n1 102
#define n2 102
#define niter 20000
double U0[n1][n2];
double U1[n1][n2];
double U2[n1][n2]; /* buffer*/
double *cU0[n1][n2];
/* simd */
__m128d vU0[n1][n2];
__m128d vU1[n1][n2];
__m128d vU2[n1][n2]; /* buffer*/
int i,j,t;
double Dx,Dy,Lx,Ly,InvDxDx,InvDyDy,Dt,alpha,totaltime,Stab,DtAlpha,DxDx,DyDy;
clock_t time0,time1;
FILE *f1;
int main()
{
/* ---- GENERAL ---- */
alpha = 0.4;
totaltime = 1.0;
Dt = totaltime/((niter-1)*1.0);
Lx = 1.0;
Ly = 1.0;
Dx = Lx/((n1-1)*1.0);
Dy = Ly/((n2-1)*1.0);
InvDxDx = 1.0/(Dx*Dx);
InvDyDy = 1.0/(Dy*Dy);
DxDx = Dx*Dx;
DyDy = Dy*Dy;
Stab = alpha*Dt*(InvDxDx+InvDyDy);
DtAlpha = Dt*alpha;
/* Stability if result <= 0.5 */
printf("Stability factor : %f n",Stab);
/* +----------------+ */
/* |     LOOP 0     | */
/* +----------------+ */
/* Init */
for( i = 0; i < n1; i++)
{
for( j = 0; j < n2; j++)
{
U0[i][j] = 0.0;
U1[i][j] = 0.0;
}
}
for( i = 0; i < n1; i++)
{
U0[i][0] = 1.0;
U1[i][0] = 1.0;
}
printf("Init OK n");
/* Core */
time0=clock();
for( t = 0; t < niter; t++)
{
/* even */
for( i = 1; i < n1-1; i++)
{
for( j = 1; j < n2-1; j++)
{
U1[i][j] = U0[i][j] + DtAlpha*( (U0[i+1][j]-2.0*U0[i][j]+U0[i-1][j])*InvDxDx + (U0[i][j+1]-2.0*U0[i][j]+U0[i][j-1])*InvDyDy);
}
}
/* odd */
for( i = 1; i < n1-1; i++)
{
for( j = 1; j < n2-1; j++)
{
U0[i][j] = U1[i][j] + DtAlpha*( (U1[i+1][j]-2.0*U1[i][j]+U1[i-1][j])*InvDxDx + (U1[i][j+1]-2.0*U1[i][j]+U1[i][j-1])*InvDyDy);
}
}
}
time1=clock();
printf("Loop 0, total time : %f n", (double) time1-time0);
f1 = fopen ("out0.dat", "wt");
for( i = 1; i < n1-1; i++)
{
for( j = 1; j < n2-1; j++)
{
fprintf (f1, "%dt%dt%fn", i, j, U0[i][j]);
}
}
/* +----------------+ */
/* |     LOOP 1     | */
/* +----------------+ */
/* Init */
for( i = 0; i < n1; i++)
{
for( j = 0; j < n2; j++)
{
U0[i][j] = 0.0;
U1[i][j] = 0.0;
}
}
for( i = 0; i < n1; i++)
{
U0[i][0] = 1.0;
U1[i][0] = 1.0;
}
printf("Init OK n");
/* Core */
time0=clock();
for( t = 0; t < niter; t++)
{
/* even */
for( i = 1; i < n1-1; i++)
{
for( j = 1; j < n2-1; j++)
{
/* U1[i][j] = U0[i][j] + Dt*alpha*( (U0[i+1][j]-2.0*U0[i][j]+U0[i-1][j])*InvDxDx + (U0[i][j+1]-2.0*U0[i][j]+U0[i][j-1])*InvDyDy); */
U1[i][j] = -2.0*U0[i][j];
U1[i][j] = U1[i][j] + U0[i+1][j];
U1[i][j] = U1[i][j] + U0[i-1][j];
U1[i][j] = U1[i][j] * InvDxDx;
U2[i][j] = -2.0*U0[i][j];
U2[i][j] = U2[i][j] + U0[i][j+1];
U2[i][j] = U2[i][j] + U0[i][j-1];
U2[i][j] = U2[i][j] * InvDyDy;
U1[i][j] = U1[i][j] + U2[i][j];
U1[i][j] = U1[i][j] * DtAlpha;
U1[i][j] = U1[i][j] + U0[i][j];
}
}
/* odd */
for( i = 1; i < n1-1; i++)
{
for( j = 1; j < n2-1; j++)
{
U0[i][j] = -2.0*U1[i][j];
U0[i][j] = U0[i][j] + U1[i+1][j];
U0[i][j] = U0[i][j] + U1[i-1][j];
U0[i][j] = U0[i][j] * InvDxDx;
U2[i][j] = -2.0*U1[i][j];
U2[i][j] = U2[i][j] + U1[i][j+1];
U2[i][j] = U2[i][j] + U1[i][j-1];
U2[i][j] = U2[i][j] * InvDyDy;
U0[i][j] = U0[i][j] + U2[i][j];
U0[i][j] = U0[i][j] * DtAlpha;
U0[i][j] = U0[i][j] + U1[i][j];
}
}
}
time1=clock();
printf("Loop 1, total time : %f n", (double) time1-time0);
/* End */
f1 = fopen ("out1.dat", "wt");
for( i = 1; i < n1-1; i++)
{
for( j = 1; j < n2-1; j++)
{
fprintf (f1, "%dt%dt%fn", i, j, U0[i][j]);
}
}
/* +----------------+ */
/* |     LOOP 3     | */
/* +----------------+ */
/* Init */
for( i = 0; i < n1; i++)
{
for( j = 0; j < n2; j++)
{
vU0[i][j] = _mm_set1_pd(0.0);
vU1[i][j] = _mm_set1_pd(0.0);
}
}
for( i = 0; i < n1; i++)
{
vU0[i][0] = _mm_set1_pd(1.0);
vU1[i][0] = _mm_set1_pd(1.0);
}
printf("Init OK n");
/* Core */
time0=clock();
for( t = 0; t < niter; t++)
{
/* even */
for( i = 1; i < n1-1; i++)
{
for( j = 1; j < n2-1; j+=2)
{
/* U1[i][j] = U0[i][j] + Dt*alpha*( (U0[i+1][j]-2.0*U0[i][j]+U0[i-1][j])*InvDxDx + (U0[i][j+1]-2.0*U0[i][j]+U0[i][j-1])*InvDyDy); */
__m128d va = _mm_set1_pd(-2.0);            /* U1[i][j] = -2.0*U0[i][j]; */
vU1[i][j] = _mm_mul_pd(va,vU0[i][j]);
vU1[i][j] = _mm_add_pd(vU1[i][j],vU0[i+1][j]);    /* U1[i][j] = U1[i][j] + U0[i+1][j]; */
vU1[i][j] = _mm_add_pd(vU1[i][j],vU0[i-1][j]);    /* U1[i][j] = U1[i][j] + U0[i-1][j]; */
__m128d vb = _mm_set1_pd(InvDxDx);            /* U1[i][j] = U1[i][j] * InvDxDx; */
vU1[i][j] = _mm_mul_pd(vb,vU1[i][j]);
vU2[i][j] = _mm_mul_pd(va,vU0[i][j]);            /* U2[i][j] = -2.0*U0[i][j]; */

vU2[i][j] = _mm_add_pd(vU2[i][j],vU0[i+1][j]);    /* U2[i][j] = U2[i][j] + U0[i+1][j]; */
vU2[i][j] = _mm_add_pd(vU2[i][j],vU0[i-1][j]);    /* U2[i][j] = U2[i][j] + U0[i-1][j]; */

__m128d vc = _mm_set1_pd(InvDyDy);            /* U2[i][j] = U2[i][j] * InvDyDy; */
vU2[i][j] = _mm_mul_pd(vc,vU2[i][j]);

vU1[i][j] = _mm_add_pd(vU1[i][j],vU2[i][j]);        /* U1[i][j] = U1[i][j] + U2[i][j]; */

__m128d vd = _mm_set1_pd(DtAlpha);            /* U1[i][j] = U1[i][j] * DtAlpha; */
vU1[i][j] = _mm_mul_pd(vd,vU1[i][j]);

vU1[i][j] = _mm_add_pd(vU1[i][j],vU0[i][j]);        /* U1[i][j] = U1[i][j] + U0[i][j]; */

}
}
/* odd */
for( i = 1; i < n1-1; i++)
{
for( j = 1; j < n2-1; j++)
{
__m128d va = _mm_set1_pd(-2.0);            /* U0[i][j] = -2.0*U1[i][j]; */
vU0[i][j] = _mm_mul_pd(va,vU1[i][j]);
vU0[i][j] = _mm_add_pd(vU0[i][j],vU1[i+1][j]);    /* U0[i][j] = U0[i][j] + U1[i+1][j]; */
vU0[i][j] = _mm_add_pd(vU0[i][j],vU1[i-1][j]);    /* U0[i][j] = U0[i][j] + U1[i-1][j]; */
__m128d vb = _mm_set1_pd(InvDxDx);            /* U0[i][j] = U0[i][j] * InvDxDx; */
vU0[i][j] = _mm_mul_pd(vb,vU0[i][j]);
vU2[i][j] = _mm_mul_pd(va,vU1[i][j]);            /* U2[i][j] = -2.0*U1[i][j]; */

vU2[i][j] = _mm_add_pd(vU2[i][j],vU1[i+1][j]);    /* U2[i][j] = U2[i][j] + U1[i+1][j]; */
vU2[i][j] = _mm_add_pd(vU2[i][j],vU1[i-1][j]);    /* U2[i][j] = U2[i][j] + U1[i-1][j]; */

__m128d vc = _mm_set1_pd(InvDyDy);            /* U2[i][j] = U2[i][j] * InvDyDy; */
vU2[i][j] = _mm_mul_pd(vc,vU2[i][j]);

vU0[i][j] = _mm_add_pd(vU0[i][j],vU2[i][j]);        /* U0[i][j] = U0[i][j] + U2[i][j]; */

__m128d vd = _mm_set1_pd(DtAlpha);            /* U0[i][j] = U0[i][j] * DtAlpha; */
vU0[i][j] = _mm_mul_pd(vd,vU0[i][j]);

vU0[i][j] = _mm_add_pd(vU0[i][j],vU1[i][j]);        /* U0[i][j] = U0[i][j] + U1[i][j]; */
}
}
}
time1=clock();
printf("Loop 3, total time : %f n", (double) time1-time0);
/* End */
f1 = fopen ("out3.dat", "wt");
for( i = 1; i < n1-1; i++)
{
for( j = 1; j < n2-1; j+=2)
{
fprintf (f1, "%dt%dt%fn", i, j, vU0[i][j]);
}
}
}

29 posts / 0 new
For more complete information about compiler optimizations, see our Optimization Notice.

>>...
>>double U0[n1][n2];
>>double U1[n1][n2];
>>double U2[n1][n2]; /* buffer*/
>>double *cU0[n1][n2];
>>/* simd */
>>__m128d vU0[n1][n2];
>>__m128d vU1[n1][n2];
>>__m128d vU2[n1][n2]; /* buffer*/
>>...

Could you try to declare all these buffers with a '__declspec( align( 16 ) )' specificator?

I changed to __attribute__ ((aligned(16))) because my gcc couldn't use the __declspec( align( 16 ) ).

__attribute__ ((aligned(16))) double U0[n1][n2];
__attribute__ ((aligned(16))) double U1[n1][n2];
__attribute__ ((aligned(16))) double U2[n1][n2];
__attribute__ ((aligned(16))) __m128d vU0[n1][n2];
__attribute__ ((aligned(16))) __m128d vU1[n1][n2];
__attribute__ ((aligned(16))) __m128d vU2[n1][n2];

It has no effects, times are the sames. I tried on a Core 2 and on an Xeon i7, same effects.

>>...the second loop is slightly faster than the first one, and the optimize loop is really slow...

Actually, I would expect a different situation regarding performance:

Fastest - 'optimize loop' ( SSE2 )
Then - 'first one' ( Reference )
Followed by - 'second one' ( Decomposition )

I can't promise for 100% but I'll try to look at your case tomorrow on a Windows XP ( 32-bit ) and VS 2005 ( MS & Intel C++ compilers ).

I would really appreciate if you could test it. This very simple code simulate the same kind of loop we face in CFD computations, and our codes can stay weeks running on 1024 Westmer CPUs. Reducing the cost of this loop would be very effective in terms of time and energy consumption. On top of that, with the new AVX, I expect results to be two times better.

I will investigate your test-cases #1 and #3 and won't do anything with the test case #2.

>>...I would really appreciate if you could test it. This very simple code simulate the same kind of loop we face in CFD computations...

What is CFD?

It happens again and again. Guys, when you're using some abbreviation(s) don't assume that everybody knows what it means! I think it is a matter of respect to explain in a short form what some abbreviation means.

>>>What is CFD?>>>

IIRC "CFD" stands for Computational Fluid Dynamics.

Yes, it's for Computational Fluid Dynamics. My apologies.
We decompose spatial domain in millions of points, and use Navier Stokes equations to resolve fluid advancement in time. Considering that at each iteration, time step is around 1e-8 seconds of real simulated time, and that an iteration takes something like 15s to be done, you can imagine how important it is to optimize calculations. (after 1 week you have 6ms of simulated fluid in average)

The example program below is resolving heat diffusion in fluids, and is a good and simple representation of what is done at larger scales in our codes.

>>Yes, it's for Computational Fluid Dynamics. My apologies.
>>We decompose spatial domain in millions of points, and use Navier Stokes equations to resolve fluid advancement in time. Considering
>>that at each iteration, time step is around 1e-8 seconds of real simulated time, and that an iteration takes something like 15s to be done,
>>you can imagine how important it is to optimize calculations. (after 1 week you have 6ms of simulated fluid in average).

Thank you and that looks Very Intersting. So, I've spent some time today and improved the performance of Test-Case #1 ( LOOP 0 ) for about 12% - 15%. I will upload optimized source codes some time tomorrow since I need to do a couple of more verifications related to a precision loss ( there 3 cases in total and I'll provide details ).

Best regards,
Sergey

>>...after 1 week you have 6ms of simulated fluid in average...

What computers do you use? Could you provide some details about a generation of CPU and model? Is it Intel, AMD, or something else?

>>>We decompose spatial domain in millions of points, and use Navier Stokes equations to resolve fluid advancement in time>>>
What numerical approximation is used to solve Navier-Stokes equation?

Thank you.

>> What computers do you use? Could you provide some details about a generation of CPU and model? Is it Intel, AMD, or something else?

In fact, a lot of different systems. I ran the code on the following type of architecture: AMD Opteron (don't remember the type), Intel Xeon with 775 socket (JADE 1 at CINES - France), Intel Xeon Westmere with 1366 socket (ANTARES at CRIHAN - France) and I used PowerPC processors with 4.7 Ghz, 32mo L3 cache (ANTARES at IDRISS - France). I used from 16 to 4096 CPUs with MPI library [http://fr.wikipedia.org/wiki/Message_Passing_Interface].
However, most of systems tend to be intel only now, most of the time Westmere and maybe 2011 socket PCUs now because of the tri and quad memory channels (more important than CPU frequency in our cases).
I also plan to use Nvidia CUDA Fermi to solve Chemistry because when used in simulations, it calls a lot of mathematical functions and simple matrix operations.

>> What numerical approximation is used to solve Navier-Stokes equation?
I use Low Mach approach: I suppose that sound propagates at infinite speed. So I cut spatial domain on a small grid with respects to Kolmogorov Scale [http://en.wikipedia.org/wiki/Kolmogorov_microscales] which is an important part of Direct Numerical Simulations. The order of spatial step is around 10 µm.
Then, Navier Stokes equations are solved using discretization [http://en.wikipedia.org/wiki/Finite_difference] and finites difference (6 order) for space and Runge Kutta time advancement. (3 iterations for one time step).
To conclude, I need to solve a Poisson equations for pressure resolution which is implicit and difficult to solve on memory distributed CPU. I use a BICGSTAB approach combined with a multigrid preconditioning system to solve respectively high and low frequencies. Our code to solve Poisson converges in 15 Iterations in average, independently of the number of CPU used, which is a good achievement.

A good example of results can be found here: https://www.youtube.com/results?search_query=direct+numerical+simulation

>>>I also plan to use Nvidia CUDA Fermi to solve Chemistry because when used in simulations, it calls a lot of mathematical functions and simple matrix operations.>>>
CUDA will be very helpful in your case.Do chemistry calculation could be easily vectorized like a video processing?

>>>To conclude, I need to solve a Poisson equations for pressure resolution which is implicit and difficult to solve on memory distributed CPU. I use a BICGSTAB approach combined with a multigrid preconditioning system to solve respectively high and low frequencies. Our code to solve Poisson converges in 15 Iterations in average, independently of the number of CPU used, which is a good achievement.>>>
Very interesting info,but it is too advanced for me.I have only written 1D numerical integrators.By reading your code it could be implemented relatively easily with the help of intrinsics.

>>...In fact, a lot of different systems. I ran the code...

Is it an Open Source project?

- Review all equations because in some cases they could be normalized in order to reduce number of multiplications ( it is related to variables InvDxDx, InvDyDy, DtAlpha )

- I evaluated performance of 4 versions of the Test-Case #1 ( LOOP 0 ) and numbers are as follows:

[ With Microsoft C/C++ compiler / Release configuration / All optimizations are Disabled ]
--- Version 1 ( Original ) = Base
--- Version 2 ( Combined / Process priority Normal ) = Improvement by ~8.42%
--- Version 3 ( Unrolled Loops 2-in-1 / Process priority Normal ) = Improvement by ~9.48%
--- Version 4 ( Unrolled Loops 3-in-1 / Process priority Normal ) = Improvement by ~10.95%
--- Version 4 ( Unrolled Loops 3-in-1 / Process priority Real-time ) = Improvement by ~12.09%

[ With MinGW C/C++ compiler / Release configuration / All optimizations are Disabled ]
--- Version 4 ( Unrolled Loops 3-in-1 / Process priority Real-time ) = Improvement by ~16.02%

- I was very impressed with performance results when MinGW C/C++ compiler was used

- I tried to use 'float' data type instead of 'double' data type and I don't recommend to use 'floats' ( performance is almost the same however there is a significant precision loss )

- If all calculations are done on a dedicated computer I recommend to use a priority boost to Real-time or High

- I don't expect a significant improvement in performance in a case when C/C++ compiler optimizations are enabled and I would consider instead a better hand-optimization with SSE or AVX instructions

- Review a Test-Case #3 ( your SSE codes ) because you've declared a couple of 128-bit variables on the stack and it affects performance ( some memory for these variables will be allocated and de-allocated more than ~208,080,000 times )

Modified sources attached.

Attachments:

AttachmentSize
5.29 KB

Here are some details regarding a precision loss:
...
61 89 Ref=0.039838 Optimized=0.039839
...
75 34 Ref=0.326130 Optimized=0.326131
...
86 57 Ref=0.093223 Optimized=0.093224
...
There are only three cases and all of them related to limitations of IEE 754 Standard. You need to decide if this is acceptable for your R&D work. I also attached a zip-file with outputs for all versions of codes.

Attachments:

AttachmentSize
406.72 KB

>>... I also attached a zip-file with outputs for all versions of codes...

I used Microsoft's WinDiff utility to compare results.

I've done a quick test with a 20-bit precision Fixed Floating Point ( FFP ) type instead of a 53-bit precision Double-Precision floating point 'double' type. I could say that was a right decision to use 'double' data type because there was a significant loss in precision of results if FFP types are used.

A bit late, my apologies. I was on travel and I couldn't get a secured internet connection. I am downloading your code and I will take a look at it tomorrow.

>>> CUDA will be very helpful in your case.Do chemistry calculation could be easily vectorized like a video processing?
Unfortunately no, and that is the challenge. Another team is working on it, and they face difficulties with the Fortran 77 code that is using a lot of “go to” instructructions.

>>> Very interesting info,but it is too advanced for me.I have only written 1D numerical integrators.By reading your code it could be implemented relatively easily with the help of intrinsics.
Yes. The equations are relatively complex, but the main core is always the same : Derivative calculations, which is the same than image processing.

>>> Is it an Open Source project?
Not know, but it is on the way to be, yes. We are currently rewriting some parts of the code to make it more “usable”. Then, we will release the sources, which could be used in OpenFoam For example (an open source fluid mechanic solver).

>>> - Review all equations because in some cases they could be normalized in order to reduce number of multiplications ( it is related to variables InvDxDx, InvDyDy, DtAlpha )
I gave you a simple loop. In normal time, Dx and Dy are not constant, so the loop can be more complicated. You need to calculate each terms at a time, but yes, I think we could comment the main loop and rewrite it in a more optimized way.

>>> - I was very impressed with performance results when MinGW C/C++ compiler was used
Yes, they made impressive improvements lately.

>>> - Review a Test-Case #3 ( your SSE codes ) because you've declared a couple of 128-bit variables on the stack and it affects performance ( some memory for these variables will be allocated and de-allocated more than ~208,080,000 times )
I will try this and report results tomorrow.

>>> >>... I also attached a zip-file with outputs for all versions of codes...
I used Microsoft's WinDiff utility to compare results.
>>> I've done a quick test with a 20-bit precision Fixed Floating Point ( FFP ) type instead of a 53-bit precision Double-Precision floating point 'double' type. I could say that was a right decision to use 'double' data type because there was a significant loss in precision of results if FFP types are used.
Thank you. I don’t know this program, so I will take a look.
In fact, because the solver can perform this loop more than 300 times per iterations, and considering that a run can be more than 100,000 iterations, precision is extremely important. For example, we tried to calculate using simple reals, and the results diverged after only 12 iterations. And if you add chemistry calculations, due to exponential operations, it explodes immediately. On top of that, the BICGSTAB solver converged at 10^-15 of precision, which cannot be achieved with simple precision.

I will report tomorrow the results on my Core i7 Xeon.

Thanks for the feedback!

>>...I gave you a simple loop. In normal time, Dx and Dy are not constant, so the loop can be more complicated...

I suspected that.

>>...In fact, because the solver can perform this loop more than 300 times per iterations, and considering that a run can be more than
>>100,000 iterations, precision is extremely important...

I don't know if you tried to use 'long double' type but I wouldn't recommend even to try it. I've done lots of tests and it doesn't help in my case. Test is, multiplication of very big matricies.

>>>>
>>Unfortunately no, and that is the challenge.

I agree and it will require a complete re-design, re-testing of some subsystems. It makes sense if CUDA will give a performance improvement in 50x or 100x.

>>>I agree and it will require a complete re-design, re-testing of some subsystems. It makes sense if CUDA will give a performance improvement in 50x or 100x.>>>

I have forgotten to add only if the calculations could be easily vectorized and be independent from each other just like pixels.

@Seregey

Hi Iliya,

>>...if the calculations could be easily vectorized and be independent from each other just like pixels...

If you review the test-case 'LOOP 0' you will see that there are dependencies between U0 and U1 arrays.

>>...Please look at my sine functions thread. I'm posting some code test-cases...

I'll take a look soon. Thanks.

>>>If you review the test-case 'LOOP 0' you will see that there are dependencies between U0 and U1 arrays.>>>
Yes of course.
A few months ago I had a very interesting "conversation" with the user "bronxz" the main subject of the discussion was advantage which posses a CPU over GPU in graphics rendering.We agreed that for complex logic which involves extensive branching , managing memory and caches a CPU has an advantage.GPU will be very useful when nicely vectorized data without complex interdependencies will be passed to it for processing.
P.s
After reviewing my sine function thread sadly I cannot find those posts(everything was lost).

Just before testing more, as I thought your loops are not the sames as the original one. U1 needs U0 to be fully calculated before being processed. And vice versa. Look at the U0[i][j+1] for example. I made the same mistake when I first tried to optimise the loop. But because it is a convergence calcul, results will be the same at the end. However, the non correct loops will take a larger time to converge.

I made the code running on Gcc 4.7 on an i7 860. I will now test your loops and try to correct my SSE instructions. Results in a few hours.

After some tests, the first loop is all the time the fastest when using GCC O3 level optimisations. I think it automaticaly unroll the loops because they are simple in this case.

I need to take a look at icc compiler because some time it makes excellents optimisations.

I also tried to make the Simd code, but it doesn't work, the executable stop running.

double *cU0[n1][n2];

double *cU1[n1][n2];

/* simd */

__attribute__ ((aligned(16))) __m128d va;

__attribute__ ((aligned(16))) __m128d vb;

__attribute__ ((aligned(16))) __m128d vc;

__attribute__ ((aligned(16))) __m128d vd;

__attribute__ ((aligned(16))) __m128d vmmx00;

__attribute__ ((aligned(16))) __m128d vmmx01;

__attribute__ ((aligned(16))) __m128d vmmx02;

__attribute__ ((aligned(16))) __m128d vmmx03;

__attribute__ ((aligned(16))) __m128d vmmx04;

__attribute__ ((aligned(16))) __m128d vmmx10;

__attribute__ ((aligned(16))) __m128d vmmx20;
[...]
va = _mm_set1_pd(-2.0);

vb = _mm_set1_pd(InvDxDx);

vc = _mm_set1_pd(InvDyDy);

vd = _mm_set1_pd(DtAlpha);
for( t = 0; t < niter; t++)

{

/* even */

for( i = 1; i < n1-1; i++)

{

for( j = 1; j < n2-1; j+=2)

{
// Need five variables : [i][j],[i][j+1],[i][j-1],[i+1][j],[i-1][j]

// respectively : vxmm0,vxmm1,vxmm2,vxmm3,vxmm4

vmmx10 = _mm_mul_pd(va,vmmx00);	// U1[i][j] = -2.0*U0[i][j];

vmmx10 = _mm_add_pd(vmmx10,vmmx03);	// U1[i][j] = U1[i][j] + U0[i+1][j];

vmmx10 = _mm_add_pd(vmmx10,vmmx04);	// U1[i][j] = U1[i][j] + U0[i-1][j];

vmmx10 = _mm_mul_pd(vb,vmmx10);	// U1[i][j] = U1[i][j] * InvDxDx;
vmmx20 = _mm_mul_pd(va,vmmx00);	// U2[i][j] = -2.0*U0[i][j];

vmmx20 = _mm_add_pd(vmmx20,vmmx01);	// U2[i][j] = U2[i][j] + U0[i][j+1];

vmmx20 = _mm_add_pd(vmmx20,vmmx02);	// U2[i][j] = U2[i][j] + U0[i][j-1];

vmmx20 = _mm_mul_pd(vc,vmmx20);	// U2[i][j] = U2[i][j] * InvDyDy;
vmmx10 = _mm_add_pd(vmmx10,vmmx20);	// U1[i][j] = U1[i][j] + U2[i][j];

vmmx10 = _mm_mul_pd(vd,vmmx10);	// U1[i][j] = U1[i][j] * DtAlpha;
vmmx10 = _mm_add_pd(vmmx10,vmmx00);	// U1[i][j] = U1[i][j] + U0[i][j];
_mm_store_pd(cU1[i][j],vmmx10);
[...]

Do you locate the error ? Because it seems correct to me and I don't have a good debugger here with me ...

I made a simplier program (one dimension). All data are aligned, and I need to use unaligned load (no problems with >= i7).

But ! The non SSE loop is 2 times faster. I don't understand...

#include

#include

#include

//#include

#define n1 1004

#define niter 200000
int i,j,t;
double U0[n1] __attribute__ ((aligned(16)));

double U1[n1] __attribute__ ((aligned(16)));

double Dx,Dy,Lx,Ly,InvDxDx,Dt,alpha,totaltime,Stab,DtAlpha,DxDx;
__m128d vmmx00;

__m128d vmmx01;

__m128d vmmx02;

__m128d vmmx10;
__m128d va;

__m128d vb;

__m128d vc;

__m128d vd;
clock_t time0,time1;

FILE *f1;

int main()

{

/* ---- GENERAL ---- */

alpha = 0.4;

totaltime = 1.0/100.0;

Dt = totaltime/((niter-1)*1.0);

Lx = 1.0;

Dx = Lx/((n1-1)*1.0);

InvDxDx = 1.0/(Dx*Dx);

DxDx = Dx*Dx;

Stab = alpha*Dt*(InvDxDx);

DtAlpha = Dt*alpha;

/* Stability if result <= 0.5 */

printf("Stability factor : %f n",Stab);
for( i = 0; i < n1; i++){U0[i] = 0.0;}

U0[1] = 1.0;

U0[2] = 1.0;

U0[3] = 1.0;

U0[n1-2] = 2.0;
//    for ( i = 0; i < n1; i++) {

//       for ( j = i + 1; j < n2; j++) {

//          std::swap(U0[i][j], U0[j][i]);

//     }

//}
va = _mm_set1_pd(-2.0);

vb = _mm_set1_pd(InvDxDx);

vd = _mm_set1_pd(DtAlpha);
time0=clock();
for( t = 0; t < niter; t++)

{
for( i = 2; i < n1-2; i+=2)

{

//printf("%d  %d  n",i,j);

//fflush(stdout);

vmmx10 = _mm_mul_pd(va,vmmx00);	// U1[i][j] = -2.0*U0[i][j];

vmmx10 = _mm_add_pd(vmmx10,vmmx01);	// U1[i][j] = U1[i][j] + U0[i+1][j];

vmmx10 = _mm_add_pd(vmmx10,vmmx02);	// U1[i][j] = U1[i][j] + U0[i-1][j];

vmmx10 = _mm_mul_pd(vb,vmmx10);	// U1[i][j] = U1[i][j] * InvDxDx;
vmmx10 = _mm_mul_pd(vd,vmmx10);	// U1[i][j] = U1[i][j] * DtAlpha;

vmmx10 = _mm_add_pd(vmmx10,vmmx00);	// U1[i][j] = U1[i][j] + U0[i][j];
_mm_store_pd(&U1[i],vmmx10);
// U1[i][j] = U0[i][j] + DtAlpha*( (U0[i+1][j]-2.0*U0[i][j]+U0[i-1][j])*InvDxDx

}
for( i = 2; i < n1-2; i+=2)

{

//printf("%d  %d  n",i,j);

//fflush(stdout);

vmmx10 = _mm_mul_pd(va,vmmx00);	// U0[i][j] = -2.0*U1[i][j];

vmmx10 = _mm_add_pd(vmmx10,vmmx01);	// U0[i][j] = U0[i][j] + U1[i+1][j];

vmmx10 = _mm_add_pd(vmmx10,vmmx02);	// U0[i][j] = U0[i][j] + U1[i-1][j];

vmmx10 = _mm_mul_pd(vb,vmmx10);	// U0[i][j] = U0[i][j] * InvDxDx;
vmmx10 = _mm_mul_pd(vd,vmmx10);	// U0[i][j] = U0[i][j] * DtAlpha;

vmmx10 = _mm_add_pd(vmmx10,vmmx00);	// U0[i][j] = U0[i][j] + U1[i][j];
_mm_store_pd(&U0[i],vmmx10);
// U1[i][j] = U0[i][j] + DtAlpha*( (U0[i+1][j]-2.0*U0[i][j]+U0[i-1][j])*InvDxDx

}
}
time1=clock();

printf("Loop 0, total time : %f n", (double) time1-time0);

f1 = fopen ("out0.dat", "wt");

for( i = 1; i < n1-1; i++)

{

fprintf (f1, "%dt%fn", i, U0[i]);

}
// REF
for( i = 0; i < n1; i++){U0[i] = 0.0;}

U0[1] = 1.0;

U0[2] = 1.0;

U0[3] = 1.0;

U0[n1-2] = 2.0;
time0=clock();
for( t = 0; t < niter; t++)

{
for( i = 2; i < n1-2; i++)

{

U1[i] = U0[i] + DtAlpha* (U0[i+1]-2.0*U0[i]+U0[i-1])*InvDxDx;

}
for( i = 2; i < n1-2; i++)

{

U0[i] = U1[i] + DtAlpha* (U1[i+1]-2.0*U1[i]+U1[i-1])*InvDxDx;

}
}
time1=clock();

printf("Loop 0, total time : %f n", (double) time1-time0);

f1 = fopen ("outref.dat", "wt");

for( i = 1; i < n1-1; i++)

{

fprintf (f1, "%dt%fn", i, U0[i]);

}
}

I really don't understand where I made a mistake...Does someone has any Idea ?