Converting my code from SSE to AVX with VS2010 SP1 gives only a 15% speedup
for the simple code fragments shown below: the SSE code runs in 20 seconds, the AVX code in 17 seconds.
Have I missed something obvious? I expected a speedup of at least 50%.
Is this a realistic expectation? I expected my AVX code to run in about 11 to 12 seconds. Am I expecting too much?
SSE CODE:
VS2010 /arch:sse
#include <xmmintrin.h>
// oh0 is a pointer to a 128-bit SSE vector (four packed floats). Per the
// author, all other SSE pointers are declared the same way, and the remaining
// variables are single-float 2D arrays reached through the row-pointer
// allocation shown below.
__m128 *oh0;
// Allocates a table of imax row pointers (float *) on a 32-byte boundary.
// NOTE(review): only the alignment of this pointer table is requested here; the
// allocation of the rows themselves is not shown - confirm every row start is
// at least 16-byte aligned, since the loop below uses aligned vector accesses.
// NOTE(review): the return value is not checked in this excerpt.
pointer = (float **)_aligned_malloc(imax * sizeof(float *),32);
// Element-wise update of ey for rows 1 .. ie-1, four floats per inner step:
//
//     ey[i][j..j+3] = ey[i][j..j+3] * caey[i][j..j+3]
//                   + cbey[i][j..j+3] * (hz[i-1][j..j+3] - hz[i][j..j+3])
//
// The vector dereferences and _mm_store_ps are aligned accesses, so every
// address formed below must be 16-byte aligned.
// NOTE(review): the inner loop always processes a full group of four floats,
// so when je is not a multiple of 4 the last step touches indices >= je;
// presumably the rows are padded - confirm.
// NOTE(review): each step performs five vector loads and one store for only
// four arithmetic instructions, so the loop is likely limited by memory
// traffic rather than by arithmetic - confirm with a profiler.
for (i = 1; i < ie; i++) {
    for (j = 0; j < je; j += 4) {
        // View the current group of four floats in each array as one vector.
        oh4 = (__m128 *)(hz[i] + j);
        oh3 = (__m128 *)(hz[i-1] + j);
        oh2 = (__m128 *)(cbey[i] + j);
        oh1 = (__m128 *)(caey[i] + j);
        oh0 = (__m128 *)(ey[i] + j);

        // Difference between the previous row of hz and the current one.
        m2 = _mm_sub_ps(*oh3, *oh4);
        // Old ey scaled by caey.
        m1 = _mm_mul_ps(*oh0, *oh1);
        // hz difference scaled by cbey.
        m3 = _mm_mul_ps(*oh2, m2);

        // Sum both terms and write the result back over ey in place.
        m4 = _mm_add_ps(m1, m3);
        _mm_store_ps(ey[i] + j, m4);
    }
}
AVX CODE:
VS2010 SP1 /arch:avx
#include <immintrin.h>
// aoh0 is a pointer to a 256-bit AVX vector (eight packed floats). Per the
// author, all other AVX pointers are declared the same way, and the remaining
// variables are single-float 2D arrays reached through the row-pointer
// allocation shown below.
__m256 *aoh0;
// Allocates a table of imax row pointers (float *) on a 32-byte boundary.
// NOTE(review): only the alignment of this pointer table is requested here; the
// allocation of the rows themselves is not shown - confirm every row start is
// 32-byte aligned, since the loop below uses aligned 256-bit accesses.
// NOTE(review): the return value is not checked in this excerpt.
pointer = (float **)_aligned_malloc(imax * sizeof(float *),32);
// Element-wise update of ey for rows 1 .. ie-1, eight floats per inner step:
//
//     ey[i][j..j+7] = ey[i][j..j+7] * caey[i][j..j+7]
//                   + cbey[i][j..j+7] * (hz[i-1][j..j+7] - hz[i][j..j+7])
//
// The vector dereferences and _mm256_store_ps are aligned accesses, so every
// address formed below must be 32-byte aligned.
// NOTE(review): the inner loop always processes a full group of eight floats,
// so when je is not a multiple of 8 the last step touches indices >= je;
// presumably the rows are padded - confirm.
// NOTE(review): each step performs five vector loads and one store for only
// four arithmetic instructions, so widening the vectors may help little if
// the loop is limited by memory traffic - confirm with a profiler.
for (i = 1; i < ie; i++) {
    for (j = 0; j < je; j += 8) {
        // View the current group of eight floats in each array as one vector.
        aoh4 = (__m256 *)(hz[i] + j);
        aoh3 = (__m256 *)(hz[i-1] + j);
        aoh2 = (__m256 *)(cbey[i] + j);
        aoh1 = (__m256 *)(caey[i] + j);
        aoh0 = (__m256 *)(ey[i] + j);

        // Difference between the previous row of hz and the current one.
        am2 = _mm256_sub_ps(*aoh3, *aoh4);
        // Old ey scaled by caey.
        am1 = _mm256_mul_ps(*aoh0, *aoh1);
        // hz difference scaled by cbey.
        am3 = _mm256_mul_ps(*aoh2, am2);

        // Sum both terms and write the result back over ey in place.
        am4 = _mm256_add_ps(am1, am3);
        _mm256_store_ps(ey[i] + j, am4);
    }
}


