Currently I am testing Parallel Studio's SSE code generation by looking at the assembler code generated with the
Assembly, Machine Code and Source (/FAcs) option.
{
float total;
int i;
__m128 num1, num2, num3, num4;
num4= _mm_setzero_ps(); //sets sum to zero
for(i=0; i {
num1 = _mm_loadu_ps(a+i); //loads unaligned array a into num1 num1= a[3] a[2] a[1] a[0]
num2 = _mm_loadu_ps(b+i); //loads unaligned array b into num2 num2= b[3] b[2] b[1] b[0]
num3 = _mm_mul_ps(num1, num2); //performs multiplication num3 = a[3]*b[3] a[2]*b[2] a[1]*b[1] a[0]*b[0]
num3 = _mm_hadd_ps(num3, num3); //performs horizontal addition
//num3= a[3]*b[3]+ a[2]*b[2] a[1]*b[1]+a[0]*b[0] a[3]*b[3]+ a[2]*b[2] a[1]*b[1]+a[0]*b[0]
num4 = _mm_add_ps(num4, num3); //performs vertical addition
}
num4= _mm_hadd_ps(num4, num4);
_mm_store_ss(&total,num4);
return total;
}
num1 = _mm_loadu_ps(a+i); //loads unaligned array a into num1 num1= a[3] a[2] a[1] a[0]
num2 = _mm_loadu_ps(b+i); //loads unaligned array b into num2 num2= b[3] b[2] b[1] b[0]
I am expecting num1 and num2 to be SSE registers, so that when this line executes:
num3 = _mm_mul_ps(num1, num2);
num1 and num2 are already initialized with the elements of a and b, and _mm_mul_ps simply performs the multiplication.
Analysing the generated assembler code, I see something different.
;;; num3 = _mm_mul_ps(num1, num2); //performs multiplication num3 = a[3]*b[3] a[2]*b[2] a[1]*b[1] a[0]*b[0]
a173c 0f 28 45 30 movaps xmm0, XMMWORD PTR [48+rbp] ;
a1740 0f 28 4d 40 movaps xmm1, XMMWORD PTR [64+rbp] ;
a1744 0f 59 c1 mulps xmm0, xmm1 ;
a1747 0f 29 45 50 movaps XMMWORD PTR [80+rbp], xmm0 ;
As you can see, there are two movaps instructions into xmm0 and xmm1, meaning the values are loaded into registers just before the multiplication.
Looking at how num1 is initialized:
;;; num1 = _mm_loadu_ps(a+i); //loads unaligned array a into num1 num1= a[3] a[2] a[1] a[0]
there are a lot of conditionals and then this code:
a16c1 48 63 85 10 01 00 00 movsxd rax, DWORD PTR [272+rbp] ;
a16c8 48 8b 95 30 01 00 00 mov rdx, QWORD PTR [304+rbp] ;
a16cf 0f 10 04 82 movups xmm0, XMMWORD PTR [rdx+rax*4] ;
a16d3 0f 29 45 30 movaps XMMWORD PTR [48+rbp], xmm0 ;
Looking at the num2 initialization:
a1726 48 63 85 10 01 00 00 movsxd rax, DWORD PTR [272+rbp] ;
a172d 48 8b 95 38 01 00 00 mov rdx, QWORD PTR [312+rbp] ;
a1734 0f 10 04 82 movups xmm0, XMMWORD PTR [rdx+rax*4] ;
a1738 0f 29 45 40 movaps XMMWORD PTR [64+rbp], xmm0 ;



