inline asm vs intrinsics

inline asm vs intrinsics

Numerical computation often involves the repetition of the same calculations with different variable values. SSE SIMD technology is perfectly adapted to this kind of calculation. My dilemma is inline assembly or use of intrinsics? To get a clearer view, I applied both methods to a simple example (unrealistic, of course), the computation of e^x by power series for two values of x. I used microsoft 2010 visual studio c++. Here is the inline assembly program:

#include
#include
#include
#include

using namespace std;

int main()
{
/***** computes e^x = 1 + x + x^2/2! + x^3/3! + ... *****/

__declspec(align(16)) double onevec[2], xvec[2], res[2];

xvec[0] = 0.5;
xvec[1] = 0.7;

onevec[0] = 1;
onevec[1] = 1;

__asm
{
mov eax,1; // counter initialization

movapd xmm0,onevec; // onevec = {1, 1} in xmm0
movapd xmm1,xvec; // xvec = {X1, x2} (2 values of the variable)
movapd xmm2,xmm0; // factorial initialization: factor = {1, 1} in xmm2
movapd xmm3,xmm1; // series linear term: {x1, x2} in xmm3
movapd xmm4,xmm0; // sum initialization: sum = {1, 1} in xmm4

addpd xmm4,xmm3; // 1st iteration: sum = {1,1} + {x1, x2}

itloop: inc eax; // next iteration

addpd xmm2,xmm0; // {n, n} -> {n+1, n+1}
mulpd xmm3,xmm1; // x^n -> x^(n+1) for both x1 & x2
divpd xmm3,xmm2; // x^(n+1)/n! -> x^(n+1)/(n+1)!
addpd xmm4,xmm3; // sum = sum + x^(n+1)/(n+1)! for both x1 & x2 ->
// iteration completed
cmp eax,6; // loop for eax = 2, 3, ... , 7
jle itloop;

movapd res,xmm4; // save result
}

cout << res[0] << " " << res[1] << endl;

return 0;
}

and its disassembly:

/***** computes e^x = 1 + x + x^2/2! + x^3/3! + ... *****/

__declspec(align(16)) double onevec[2], xvec[2], res[2];

xvec[0] = 0.5;
011414C0 fld qword ptr [__real@3fe0000000000000 (1147858h)]
011414C6 fstp qword ptr [xvec]
xvec[1] = 0.7;
011414C9 fld qword ptr [__real@3fe6666666666666 (1147848h)]
011414CF fstp qword ptr [ebp-38h]

onevec[0] = 1;
011414D2 fld1
011414D4 fstp qword ptr [onevec]
onevec[1] = 1;
011414D7 fld1
011414D9 fstp qword ptr [ebp-18h]

__asm
{
mov eax,1; // counter initialization
011414DC mov eax,1

movapd xmm0,onevec; // onevec = {1, 1} in xmm0
011414E1 movapd xmm0,xmmword ptr [onevec]
movapd xmm1,xvec; // xvec = {X1, x2} (2 values of the variable)
011414E6 movapd xmm1,xmmword ptr [xvec]
movapd xmm2,xmm0; // factorial initialization: factor = {1, 1} in xmm2
011414EB movapd xmm2,xmm0
movapd xmm3,xmm1; // series linear term: {x1, x2} in xmm3
011414EF movapd xmm3,xmm1
movapd xmm4,xmm0; // sum initialization: sum = {1, 1} in xmm4
011414F3 movapd xmm4,xmm0

addpd xmm4,xmm3; // 1st iteration: sum = {1,1} + {x1, x2}
011414F7 addpd xmm4,xmm3

itloop: inc eax; // next iteration
011414FB inc eax

addpd xmm2,xmm0; // {n, n} -> {n+1, n+1}
011414FC addpd xmm2,xmm0
mulpd xmm3,xmm1; // x^n -> x^(n+1) for both x1 & x2
01141500 mulpd xmm3,xmm1
divpd xmm3,xmm2; // x^(n+1)/n! -> x^(n+1)/(n+1)!
01141504 divpd xmm3,xmm2
addpd xmm4,xmm3; // sum = sum + x^(n+1)/(n+1)! for both x1 & x2 ->
01141508 addpd xmm4,xmm3
// iteration completed
cmp eax,6; // loop for eax = 2, 3, ... , 7
0114150C cmp eax,6
jle itloop;
0114150F jle itloop (11414FBh)

movapd res,xmm4; // save result
01141511 movapd xmmword ptr [res],xmm4
}

Of course, it strictly conforms to the inline assembly code! (how could it be different?).

Now, here is the intrinsics-based code as I have been able to write it down:

#include
#include
#include
#include

using namespace std;

int main()
{
/***** computes e^x = 1 + x + x^2/2! + x^3/3! + ... *****/

__declspec(align(16)) double res[2];
const int imax = 6;

double x1 = 0.5;
double x2 = 0.7;

double w = 1;
register int i = 1;
double acc = 0.001;

__m128d one = _mm_set1_pd(w); // one = {1, 1}
__m128d x = _mm_set_pd(x2, x1); // x = {X1, x2}
__m128d factor = one; // factorial initialization: factor = {1, 1}
__m128d sum = one; // sum initialization: sum = {1, 1}
__m128d term = x; // linear term: {x1, x2}

sum = _mm_add_pd(sum, term); // 1st iteration: sum = {1+x1, 1+x2}

do
{
i++;
factor = _mm_add_pd(factor, one); // {n, n} -> {n+1, n+1}
term = _mm_mul_pd(term, x); // x^n -> x^{n+1}
term = _mm_div_pd(term, factor); // x^{n+1}/n -> x^{n+1}/(n+1)
sum = _mm_add_pd(sum, term); // sum -> sum + x^{n+1}/(n+1)
} while(i<=imax);

_mm_store_pd(res, sum);
cout << res[0] << " " << res[1] << endl;

return 0;
}

and its disassembly:

/***** computes e^x = 1 + x + x^2/2! + x^3/3! + ... *****/

__declspec(align(16)) double res[2];

const int imax = 6;
002A14C0 mov dword ptr [imax],6

double x1 = 0.5;
002A14C7 fld qword ptr [__real@3fe0000000000000 (2A7868h)]
002A14CD fstp qword ptr [x1]
double x2 = 0.7;
002A14D0 fld qword ptr [__real@3fe6666666666666 (2A7858h)]
002A14D6 fstp qword ptr [x2]

double w = 1;
002A14D9 fld1
002A14DB fstp qword ptr [w]
register int i = 1;
002A14DE mov dword ptr [i],1
double acc = 0.001;
002A14E5 fld qword ptr [__real@3f50624dd2f1a9fc (2A7838h)]
002A14EB fstp qword ptr [acc]

__m128d one = _mm_set1_pd(w); // one = {1, 1}
002A14EE movsd xmm0,mmword ptr [w]
002A14F3 shufpd xmm0,xmm0,0
002A14F8 movapd xmmword ptr [ebp-2C0h],xmm0
002A1500 movapd xmm0,xmmword ptr [ebp-2C0h]
002A1508 movapd xmmword ptr [one],xmm0
__m128d x = _mm_set_pd(x2, x1); // x = {X1, x2}
002A1510 movsd xmm0,mmword ptr [x1]
002A1515 movsd xmm1,mmword ptr [x2]
002A151A unpcklpd xmm0,xmm1
002A151E movapd xmmword ptr [ebp-2A0h],xmm0
002A1526 movapd xmm0,xmmword ptr [ebp-2A0h]
002A152E movapd xmmword ptr [x],xmm0
__m128d factor = one; // factorial initialization: factor = {1, 1}
002A1536 movapd xmm0,xmmword ptr [one]
002A153E movapd xmmword ptr [factor],xmm0
__m128d sum = one; // sum initialization: sum = {1, 1}
002A1546 movapd xmm0,xmmword ptr [one]
002A154E movapd xmmword ptr [sum],xmm0
__m128d term = x; // linear term: {x1, x2}
002A1556 movapd xmm0,xmmword ptr [x]
002A155E movapd xmmword ptr [term],xmm0

sum = _mm_add_pd(sum, term); // 1st iteration: sum = {1+x1, 1+x2}
002A1566 movapd xmm0,xmmword ptr [term]
002A156E movapd xmm1,xmmword ptr [sum]
002A1576 addpd xmm1,xmm0
002A157A movapd xmmword ptr [ebp-280h],xmm1
002A1582 movapd xmm0,xmmword ptr [ebp-280h]
002A158A movapd xmmword ptr [sum],xmm0

do
{
i++;
002A1592 mov eax,dword ptr [i]
002A1595 add eax,1
002A1598 mov dword ptr [i],eax
factor = _mm_add_pd(factor, one); // {n, n} -> {n+1, n+1}
002A159B movapd xmm0,xmmword ptr [one]
002A15A3 movapd xmm1,xmmword ptr [factor]
002A15AB addpd xmm1,xmm0
002A15AF movapd xmmword ptr [ebp-260h],xmm1
002A15B7 movapd xmm0,xmmword ptr [ebp-260h]
002A15BF movapd xmmword ptr [factor],xmm0
term = _mm_mul_pd(term, x); // x^n -> x^{n+1}
002A15C7 movapd xmm0,xmmword ptr [x]
002A15CF movapd xmm1,xmmword ptr [term]
002A15D7 mulpd xmm1,xmm0
002A15DB movapd xmmword ptr [ebp-240h],xmm1
002A15E3 movapd xmm0,xmmword ptr [ebp-240h]
002A15EB movapd xmmword ptr [term],xmm0
term = _mm_div_pd(term, factor); // x^{n+1}/n -> x^{n+1}/(n+1)
002A15F3 movapd xmm0,xmmword ptr [factor]
002A15FB movapd xmm1,xmmword ptr [term]
002A1603 divpd xmm1,xmm0
002A1607 movapd xmmword ptr [ebp-220h],xmm1
002A160F movapd xmm0,xmmword ptr [ebp-220h]
002A1617 movapd xmmword ptr [term],xmm0
sum = _mm_add_pd(sum, term); // sum -> sum + x^{n+1}/(n+1)
002A161F movapd xmm0,xmmword ptr [term]
002A1627 movapd xmm1,xmmword ptr [sum]
002A162F addpd xmm1,xmm0
002A1633 movapd xmmword ptr [ebp-200h],xmm1
002A163B movapd xmm0,xmmword ptr [ebp-200h]
002A1643 movapd xmmword ptr [sum],xmm0
} while(i<=imax);
002A164B cmp dword ptr [i],6
002A164F jle main+102h (2A1592h)

_mm_store_pd(res, sum);
002A1655 movapd xmm0,xmmword ptr [sum]
002A165D movapd xmmword ptr [res],xmm0

It seems clear that, with the intrinsics instructions I used, the variables are saved in memory and reloaded into xmm registers at each step of the loop. This results in a loop of 29 machine codes, while the inline asm loop involves 7 op codes only.

I am everything but a programming expert (in fact, I am a physicist). Comments of experts would be appreciated. Are there intrinsics instructions that would avoid writing intermediate variable values into memory? Another question: visual studio c++ does not accept inline assembly in x64. How about Intel compilers?

12 posts / 0 new
Last post
For more complete information about compiler optimizations, see our Optimization Notice.
Best Reply

You must have compiled your sample with optimizations turned off (e.g. in Debug mode) - when compiled with default optimizations in Release mode (I used the VS2008 compiler) the loop code for your intrinsics example is only 7 instructions long, essentially identical to your inline asm example. In general I prefer to use intrinsics for most work: it makes life much easier when interfacing with the surrounding C++ code and with a little care it is usually as fast as inline assembler. In fact, it is often faster if you are not a highly experienced assembly language programmer: if you don't know the exact characteristics of each assembly language instruction, don't know how to arrange your instructions to best utilize issue ports and hide latency, pair instructions that can be fused, etc., then it is likely the compiler will do a better job translating intrinsics to efficient code than you'd do by writing the inline assembly directly. For example, the Intel compiler unrolls the loop in your code because it sees the constant imax=6 and recognizes that a loop isn't really needed. In addition, it rotates register usage and rearranges instructions in that inline code to hide latency and optimize throughput. The resulting code is much faster than the hand-coded loop you wrote (yes, I know it was an unrealistic example, but realistically it's unlikely you would do better by hand than the compiler).

So what I'd recommend is this:
1) Check your algorithm. A better algorithm is by far the easiest way to get a performance improvement.
2) Use somebody else's code :). Performance libraries like MKL (which includes BLAS, LAPACK, etc.), VML, and
IPP can save you a tremendous amount of time by providing
expertly-optimized implementations of many common (and many
not-so-common) mathematical and statistical methods. Don't fool yourself into thinking you can write faster matrix code than you can find in these libraries!
3) Look for parallelization opportunities. Cilk and TBB (available with Intel Parallel Studio) make it pretty easy to write simple parallel code (as do the parallel extensions in VS2010). Avoid tricky parallel code: you can spend hours debugging incredibly subtle race conditions and locking problems if you try to get too clever. MKL, VML, and IPP often use parallelization automatically (they use OpenMP internally).
4) If you must write the core code yourself, write it first in C/C++. This is easier to get correct, gets you a working program sooner, and later gives you a reference to check the correctness of your optimized code.
5) Profile the code running on a realistic data set. Find your hotspots (usually in the innermost loops).
6) Optimize those hotspots in order of priority using intrinsics. Compare the results produced by the intrinsics to the results produced by your C/C++ code, and compare the performance too. Note that it may be difficult to achieve a performance improvement: the compiler is often very good at optimizing C/C++ code directly. You may have a better chance of improving performance by improving locality of access (i.e. using blocked/tiled algorithms to improve cache utilization) than by writing with intrinsics.
7) Only in extreme cases rewrite code to use inline assembly. Again, if you're not a proficient ASM programmer then you probably won't get a performance improvement.

BTW, the Intel compiler does accept inline assembly in x64 (unlike the VS compiler). If you're looking for the fastest code, I'd recommend using the Intel compiler - Parallel Studio integrates fairly seamlessly within Visual Studio and includes Cilk, TBB, and the IPP library as well as a very nice performance profiling environment. Parallel Studio Ex (a little pricier) adds MKL, VML, and an even more capable performance profiling tool (VTune) - among other things.

Hope this helps!

Thanks a lot. This is a perfect answer. Of course, you are right. In release mode, intrinsics are OK. I didn't know that the codes in debug and release modes are so different. Thinking to this afterwards, loading the values of the variables into memory at each step in debug mode is quite natural since they must be available for debugging. Maybe we should stress here not to use debug mode for lengthy calculations. This is probably obvious to experts like you but could be overlooked by users like me for which programming is a tool ideally requiring as little time and effort as possible.

I agree that modern Intel compilers give better optimized codes that we can do by hand. BTW I'd be curious to see what improvement it can bring to my so short inline asm codes, except loop unrolling. At my university, the Intel compilers are located at a central facility and the access is mainly via command lines, which makes debug somewhat unpleasant. I'll try to get a disassembly. Any way it's just curiosity!

As much as possible, I try to use existing function libraries. I don't want to reinvent the wheel! Unfortunately, often, I don't find what I'm looking for. In my present research, I use Legendre toroidal functions and till now haven't found anything up to date.

Many thanks again. I hope I didn't take too much of your time.

Glad to help. If you want to see how efficiently the Intel compiler generates code, try this little function (implements your exp example as a vector function):

void vexp(double* __restrict ax, double* __restrict ares, int n)
{
const int imax = 6;
#pragma simd
for(int k=0;k {
double x=ax[k];
double term = x;
double factor=1.0;
double sum=1.0+term;

for(int i=0;i {
factor+=1.0;
term=term*x/factor;
sum+=term;
}
ares[k] = sum;
}
}

The __restrict keyword tells the compiler that the ax and ares array pointers don't alias (i.e. don't overlap) - this can significantly improve efficiency. The #pragma simd directive is a suggestion to the compiler to use SIMD instructions for the code that follows. You'll probably find that you can create efficient code more easily by learning about using these kinds of keywords and directives in standard C/C++ code than by using intrinsics: if you try to insert your intrinsics-based code in the sample above, you will probably find that you just slow things down.

BTW, if you substitute float for double in the sample, the efficiency goes up dramatically: not only is there half as much data to move, there is a SIMD reciprocal approximation instruction for floats that is much faster than the division instruction - the compiler knows this, and uses it when possible. You can learn a lot by looking at the assembly language generated by the compiler!

Hmm. I remembered something I had read in the documentation for #pragma simd: apparently it can produce incorrect code if the characteristics of variables in the following loop aren't described in extension clauses to the simd directive. I wouldn't have thought this would be an issue in such a simple case, but I compiled the snippet above and sure enough the output is incorrect when #pragma simd is used, but correct when #pragma simd is not used! Maybe it is a corner case for the compiler - apparently the inner loop causes some confusion, because if you unroll it manually everything works fine even with #pragma simd! Very odd.

So I guess this just reinforces a point I made earlier: always start with a simple non-optimized implementation of your algorithm first so you can verify correctness as you optimize! And don't assume helpful little things like #pragma simd are as simple and foolproof as they look :).

If anyone knows why #pragma simd is failing in such a simple case, I'd love to hear about it! The documentation is quite unhelpful - I tried a couple of things but couldn't come up with a way to coerce the compiler to generate correct code without manually unrolling the inner loop. It's a little worrisome that this directive changes the semantics of the loop code without issuing any warnings.

According to documentation, #pragma simd instructs the compiler to ignore even proven dependencies which would inhibit vectorization. It became more aggressive with the 12.1 compilers, and we have cases simpler than this which are broken. Accordingly, it's difficult to sort out where #pragma simd may be appropriate; a basic rule is to avoid it where combinations of restrict and #pragma ivdep should be sufficient.
There is a separate pragma for unroll-and-jam, in case that was the intent for putting the #pragma simd in the position shown. The usual usage for promotion of vector reduction would be to put #pragma simd on the inner reduction loop and specify the reduction and private variables. Of course, this example isn't suitable for vector reduction.

1. Regarding number of terms: Number of calculated terms of the series to calculate power of the Exponential constant affects accuracy and has to be taken into account;

2. Performance evaluations are as follows for 12 terms calculated:

( 10,000,000 calls for every implementation / 1 sec = 1000 ticks / All Optimizations disabled )

- Inline Assembler ~= 5,516 ticks
- Intrinsic Functions ~= 10,828 ticks
- Recursivein C ~= 19,188 ticks

3. A couple words about accuracy of calculations:

Let's assume that a true value of Exponential constant is 2.718281828459045 ( up to 15 digits ).

Then, if x equals to 1, here are calculations with different number of terms in the Exponential series for 'double' data type ( FPU init-settings - Default ):

Dbl Epsilon : 0.0000000000000002220446

Exp-constant is : 2.7182818284590450000000
x=1.0 -> E^ 1.0 is: 2.7182818284590451000000 Number of Terms: 18
Note: Limitation of IEEE 754 standardfor a double data typereached
x=1.0 -> E^ 1.0 is: 2.7182818284590451000000 Number of Terms: 17 - 15 digits matched
x=1.0 -> E^ 1.0 is: 2.7182818284590420000000 Number of Terms: 16 - 14 digits matched
x=1.0 -> E^ 1.0 is: 2.7182818284589945000000 Number of Terms: 15 - 11 digits matched
x=1.0 -> E^ 1.0 is: 2.7182818284582297000000 Number of Terms: 14 - 11 digits matched
x=1.0 -> E^ 1.0 is: 2.7182818284467589000000 Number of Terms: 13 - 10 digits matched
x=1.0 -> E^ 1.0 is: 2.7182818282861687000000 Number of Terms: 12 - 9 digits matched
x=1.0 -> E^ 1.0 is: 2.7182818261984929000000 Number of Terms: 11 - 8 digits matched
x=1.0 -> E^ 1.0 is: 2.7182818011463845000000 Number of Terms: 10 - 7 digits matched
x=1.0 -> E^ 1.0 is: 2.7182815255731922000000 Number of Terms: 9 - 6 digits matched
x=1.0 -> E^ 1.0 is: 2.7182787698412696000000 Number of Terms: 8 - 4 digits matched
x=1.0 -> E^ 1.0 is: 2.7182539682539684000000 Number of Terms: 7 - 4 digits matched
x=1.0 -> E^ 1.0 is: 2.7180555555555554000000 Number of Terms: 6 - 3 digits matched
x=1.0 -> E^ 1.0 is: 2.7166666666666668000000 Number of Terms: 5 - 2 digits matched

In order to provide the best double-precision accuracy of theExponential constant compared to the
true value the calculation has to be done with up to at least 17th term.

In case of calculations with up to 17th term accuracy is:

Abs Error: 0.000000000000000
Rel Error : 0.000000000000000
Pct Error : 0.000000000000000 %

In case of calculations with up to 16th term accuracy is:

Abs Error: -0.000000000000003
Rel Error : -0.000000000000001
Pct Error : -0.000000000000114 %

4. Regarding optimizations:Of course, Vectorization is agood thing, but it looks like everybody forgot some very simple ways to optimize an algorithm or algorithms...

- The classic expression to calculate the Exponential constant using series, that is

e^x = 1 + x + (x^2)/2! + (x^3)/3! + (x^4)/4! + ... ( *1 )

could be normalized in order to reduce number of multiplications when calculating powers of x;

- Factorials could be replaced by constants, 2!=2, 3!=6, 4!=24, etc, in order to speed up
calculations;

- Calculations could be done backwards starting withhigher terms ( smaller values)in order to minimize rounding errors and improve accuracy of the final value;

- Instruction 'mov eax, 1' could be replaced with 'xor eax, eax'.

5. Do you know, that the Exponential constant could be calculated using Horner's Rule?
Here is an example with the same accuracy as the classic expression ( *1 )up to 12th term:

...
dExpCalcV = ((((((((((( 1.0L + 1.0L / 12.0L ) / 11.0L + 1.0L ) / 10.0L + 1.0L ) / 9.0L +
1.0L ) / 8.0L + 1.0L ) / 7.0L + 1.0L ) / 6.0L +
1.0L ) / 5.0L + 1.0L ) / 4.0L + 1.0L ) / 3.0L + 1.0L ) / 2.0L + 1.0L ) + 1.0L;
...

6. Here are some real results:

Best regards,
Sergey

I detected a little issue in a Sub-Test 4 ( for Intrinsic Functions )and correct results are as follows:

...
Sub-Test 4 - Using Intrinsic Functions
Exp-constant is : 2.718281828459045100
Intrinsic-Based completed in: 11844 ticks
Exp-constant is : 2.718281828286168700 Rel Error: -0.000000000063597666 ( 12 terms accuracy )
Exp-constant is : 2.718281828286168700 Rel Error: -0.000000000063597666 ( 12 terms accuracy )
...

In case of calculation of the Exponential constant using the series expression up to 12th term accuracy between all three implementations is the same:

...
...Rel Error: -0.000000000063597666 ( 12 terms accuracy )...
...

So I initiated a discussion between experts on the use on #pragma simd! Unfortunately the documentation on subtleties like these #pragma's and even on more basic aspects are often either hard to find or mostly unhelpful to the user whose programming expertise is limited to the strict necessity for his applications. I totally agree with you on why develop some code by hand when the compiler can write it in an optimized form.
Thanks again for your help.

>>...why develop some code by hand when the compiler can write it in an optimized form...

That's a "sad"reality of Modern Software Development whenSoftware Developers, even experienced (!!!), too reliant on software optimizations by a C/C++ compiler.

Whenthere are1,000 code lines in C, or in C++, or in Assembler,instead of 100 it's an illusion that a C/C++ compiler will makethat codesignificantly faster by enabling some "cool" optimization(s). Of course, itcould make it faster, but not too significantly.

I'm forced to doa highly portable software programming due to strict requirements of my current project and when it comes to optimizations they are always Disabled in Debug and Release configurations.

Quoting Sergey Kostrov

I'm forced to doa highly portable software programming due to strict requirements of my current project and when it comes to optimizations they are always Disabled in Debug and Release configurations.

That's an extreme position.
Options such as Intel /fp:source and options to turn off interprocedural analysis are provided to meet requirements of portability.

>>...Options such as Intel /fp:source and options to turn off interprocedural analysis are provided
>>to meet requirements of portability...

That's good to know. Thank you for your feedback, Tim!

Leave a Comment

Please sign in to add a comment. Not a member? Join today