Numerical computation often involves repeating the same calculation with different variable values. SSE SIMD technology is perfectly suited to this kind of calculation. My dilemma: inline assembly, or intrinsics? To get a clearer view, I applied both methods to a simple (admittedly unrealistic) example — the computation of e^x by power series for two values of x. I used Microsoft Visual Studio 2010 C++. Here is the inline assembly program:

#include <iostream>

#include <iomanip>

#include <cmath>

#include <emmintrin.h>
// NOTE: the original header names were lost in the forum formatting;
// the four includes above are a plausible reconstruction (only <iostream>
// is strictly required by this program).

using namespace std;

int main()

{

/***** computes e^x = 1 + x + x^2/2! + x^3/3! + ... for two values of x at once
       (SSE2 packed doubles via MSVC 32-bit __asm); the series is summed
       through the x^7/7! term *****/

__declspec(align(16)) double onevec[2], xvec[2], res[2]; // movapd requires 16-byte-aligned memory operands

xvec[0] = 0.5;

xvec[1] = 0.7;

onevec[0] = 1;

onevec[1] = 1;

__asm

{

mov eax,1; // iteration counter initialization (n = 1)

movapd xmm0,onevec; // onevec = {1, 1} in xmm0 (packed constant 1.0)

movapd xmm1,xvec; // xvec = {x1, x2} (2 values of the variable)

movapd xmm2,xmm0; // factorial denominator counter: factor = {1, 1} in xmm2

movapd xmm3,xmm1; // current series term x^n/n!, starts as {x1, x2} in xmm3

movapd xmm4,xmm0; // running sum initialization: sum = {1, 1} in xmm4

addpd xmm4,xmm3; // 1st iteration: sum = {1,1} + {x1, x2}

itloop: inc eax; // next iteration: n -> n+1

addpd xmm2,xmm0; // {n, n} -> {n+1, n+1}

mulpd xmm3,xmm1; // x^n -> x^(n+1) for both x1 & x2

divpd xmm3,xmm2; // x^(n+1)/n! -> x^(n+1)/(n+1)!

addpd xmm4,xmm3; // sum = sum + x^(n+1)/(n+1)! for both x1 & x2 ->

// iteration completed

cmp eax,6; // loop body runs for eax = 2, 3, ..., 7, so the last term added is x^7/7!

jle itloop;

movapd res,xmm4; // store result: res = {~e^x1, ~e^x2}

}

cout << res[0] << " " << res[1] << endl;

return 0;

}

and its disassembly:

/***** computes e^x = 1 + x + x^2/2! + x^3/3! + ... *****/

__declspec(align(16)) double onevec[2], xvec[2], res[2];

xvec[0] = 0.5;

011414C0 fld qword ptr [__real@3fe0000000000000 (1147858h)]

011414C6 fstp qword ptr [xvec]

xvec[1] = 0.7;

011414C9 fld qword ptr [__real@3fe6666666666666 (1147848h)]

011414CF fstp qword ptr [ebp-38h]

onevec[0] = 1;

011414D2 fld1

011414D4 fstp qword ptr [onevec]

onevec[1] = 1;

011414D7 fld1

011414D9 fstp qword ptr [ebp-18h]

__asm

{

mov eax,1; // counter initialization

011414DC mov eax,1

movapd xmm0,onevec; // onevec = {1, 1} in xmm0

011414E1 movapd xmm0,xmmword ptr [onevec]

movapd xmm1,xvec; // xvec = {X1, x2} (2 values of the variable)

011414E6 movapd xmm1,xmmword ptr [xvec]

movapd xmm2,xmm0; // factorial initialization: factor = {1, 1} in xmm2

011414EB movapd xmm2,xmm0

movapd xmm3,xmm1; // series linear term: {x1, x2} in xmm3

011414EF movapd xmm3,xmm1

movapd xmm4,xmm0; // sum initialization: sum = {1, 1} in xmm4

011414F3 movapd xmm4,xmm0

addpd xmm4,xmm3; // 1st iteration: sum = {1,1} + {x1, x2}

011414F7 addpd xmm4,xmm3

itloop: inc eax; // next iteration

011414FB inc eax

addpd xmm2,xmm0; // {n, n} -> {n+1, n+1}

011414FC addpd xmm2,xmm0

mulpd xmm3,xmm1; // x^n -> x^(n+1) for both x1 & x2

01141500 mulpd xmm3,xmm1

divpd xmm3,xmm2; // x^(n+1)/n! -> x^(n+1)/(n+1)!

01141504 divpd xmm3,xmm2

addpd xmm4,xmm3; // sum = sum + x^(n+1)/(n+1)! for both x1 & x2 ->

01141508 addpd xmm4,xmm3

// iteration completed

cmp eax,6; // loop for eax = 2, 3, ... , 7

0114150C cmp eax,6

jle itloop;

0114150F jle itloop (11414FBh)

movapd res,xmm4; // save result

01141511 movapd xmmword ptr [res],xmm4

}

Of course, it strictly conforms to the inline assembly code (how could it be otherwise?).

Now, here is the intrinsics-based code as I have been able to write it down:

#include <iostream>

#include <iomanip>

#include <cmath>

#include <emmintrin.h>
// NOTE: the original header names were lost in the forum formatting; the
// includes above are a plausible reconstruction. <emmintrin.h> (SSE2
// intrinsics) and <iostream> are the ones this program actually requires.

using namespace std;

int main()

{

/***** computes e^x = 1 + x + x^2/2! + x^3/3! + ... for two values of x at
       once, using SSE2 intrinsics; the series is summed through the x^7/7!
       term, matching the inline-assembly version *****/

// 16-byte alignment is required by _mm_store_pd (use __declspec(align(16))
// on pre-C++11 MSVC such as VS2010).
alignas(16) double res[2];

const int imax = 6;                     // do-while body runs for i = 2..7 -> last term is x^7/7!

const double x1 = 0.5;

const double x2 = 0.7;

// (removed: unused variable `acc`, and the `register` keyword, which was
//  removed from the language in C++17)

__m128d one = _mm_set1_pd(1.0);         // one = {1, 1}

__m128d x = _mm_set_pd(x2, x1);         // x = {x1, x2} (x1 in the low lane)

__m128d factor = one;                   // factorial denominator counter: factor = {1, 1}

__m128d term = x;                       // current series term x^n/n!, starts as {x1, x2}

__m128d sum = _mm_add_pd(one, term);    // 1st iteration: sum = {1+x1, 1+x2}

int i = 1;

do

{

i++;

factor = _mm_add_pd(factor, one);       // {n, n} -> {n+1, n+1}

term = _mm_mul_pd(term, x);             // x^n/n! -> x^(n+1)/n!

term = _mm_div_pd(term, factor);        // x^(n+1)/n! -> x^(n+1)/(n+1)!

sum = _mm_add_pd(sum, term);            // sum -> sum + x^(n+1)/(n+1)! (both lanes)

} while(i<=imax);

_mm_store_pd(res, sum);                 // res = {~e^x1, ~e^x2}

cout << res[0] << " " << res[1] << endl;

return 0;

}

and its disassembly:

/***** computes e^x = 1 + x + x^2/2! + x^3/3! + ... *****/

__declspec(align(16)) double res[2];

const int imax = 6;

002A14C0 mov dword ptr [imax],6

double x1 = 0.5;

002A14C7 fld qword ptr [__real@3fe0000000000000 (2A7868h)]

002A14CD fstp qword ptr [x1]

double x2 = 0.7;

002A14D0 fld qword ptr [__real@3fe6666666666666 (2A7858h)]

002A14D6 fstp qword ptr [x2]

double w = 1;

002A14D9 fld1

002A14DB fstp qword ptr [w]

register int i = 1;

002A14DE mov dword ptr [i],1

double acc = 0.001;

002A14E5 fld qword ptr [__real@3f50624dd2f1a9fc (2A7838h)]

002A14EB fstp qword ptr [acc]

__m128d one = _mm_set1_pd(w); // one = {1, 1}

002A14EE movsd xmm0,mmword ptr [w]

002A14F3 shufpd xmm0,xmm0,0

002A14F8 movapd xmmword ptr [ebp-2C0h],xmm0

002A1500 movapd xmm0,xmmword ptr [ebp-2C0h]

002A1508 movapd xmmword ptr [one],xmm0

__m128d x = _mm_set_pd(x2, x1); // x = {X1, x2}

002A1510 movsd xmm0,mmword ptr [x1]

002A1515 movsd xmm1,mmword ptr [x2]

002A151A unpcklpd xmm0,xmm1

002A151E movapd xmmword ptr [ebp-2A0h],xmm0

002A1526 movapd xmm0,xmmword ptr [ebp-2A0h]

002A152E movapd xmmword ptr [x],xmm0

__m128d factor = one; // factorial initialization: factor = {1, 1}

002A1536 movapd xmm0,xmmword ptr [one]

002A153E movapd xmmword ptr [factor],xmm0

__m128d sum = one; // sum initialization: sum = {1, 1}

002A1546 movapd xmm0,xmmword ptr [one]

002A154E movapd xmmword ptr [sum],xmm0

__m128d term = x; // linear term: {x1, x2}

002A1556 movapd xmm0,xmmword ptr [x]

002A155E movapd xmmword ptr [term],xmm0

sum = _mm_add_pd(sum, term); // 1st iteration: sum = {1+x1, 1+x2}

002A1566 movapd xmm0,xmmword ptr [term]

002A156E movapd xmm1,xmmword ptr [sum]

002A1576 addpd xmm1,xmm0

002A157A movapd xmmword ptr [ebp-280h],xmm1

002A1582 movapd xmm0,xmmword ptr [ebp-280h]

002A158A movapd xmmword ptr [sum],xmm0

do

{

i++;

002A1592 mov eax,dword ptr [i]

002A1595 add eax,1

002A1598 mov dword ptr [i],eax

factor = _mm_add_pd(factor, one); // {n, n} -> {n+1, n+1}

002A159B movapd xmm0,xmmword ptr [one]

002A15A3 movapd xmm1,xmmword ptr [factor]

002A15AB addpd xmm1,xmm0

002A15AF movapd xmmword ptr [ebp-260h],xmm1

002A15B7 movapd xmm0,xmmword ptr [ebp-260h]

002A15BF movapd xmmword ptr [factor],xmm0

term = _mm_mul_pd(term, x); // x^n -> x^{n+1}

002A15C7 movapd xmm0,xmmword ptr [x]

002A15CF movapd xmm1,xmmword ptr [term]

002A15D7 mulpd xmm1,xmm0

002A15DB movapd xmmword ptr [ebp-240h],xmm1

002A15E3 movapd xmm0,xmmword ptr [ebp-240h]

002A15EB movapd xmmword ptr [term],xmm0

term = _mm_div_pd(term, factor); // x^{n+1}/n -> x^{n+1}/(n+1)

002A15F3 movapd xmm0,xmmword ptr [factor]

002A15FB movapd xmm1,xmmword ptr [term]

002A1603 divpd xmm1,xmm0

002A1607 movapd xmmword ptr [ebp-220h],xmm1

002A160F movapd xmm0,xmmword ptr [ebp-220h]

002A1617 movapd xmmword ptr [term],xmm0

sum = _mm_add_pd(sum, term); // sum -> sum + x^{n+1}/(n+1)

002A161F movapd xmm0,xmmword ptr [term]

002A1627 movapd xmm1,xmmword ptr [sum]

002A162F addpd xmm1,xmm0

002A1633 movapd xmmword ptr [ebp-200h],xmm1

002A163B movapd xmm0,xmmword ptr [ebp-200h]

002A1643 movapd xmmword ptr [sum],xmm0

} while(i<=imax);

002A164B cmp dword ptr [i],6

002A164F jle main+102h (2A1592h)

_mm_store_pd(res, sum);

002A1655 movapd xmm0,xmmword ptr [sum]

002A165D movapd xmmword ptr [res],xmm0

It seems clear that, with the intrinsic instructions I used, the variables are stored to memory and reloaded into xmm registers at each step of the loop. This results in a loop of 29 machine instructions, while the inline-assembly loop contains only 7. (Note that both listings come from a non-optimized debug build.)

I am anything but a programming expert (in fact, I am a physicist). Comments from experts would be appreciated. Are there intrinsic instructions that would avoid writing intermediate variable values to memory? Another question: Visual Studio C++ does not accept inline assembly in x64 builds. How about the Intel compilers?