Fastest version of a CRT-function 'memset'

Fastest version of a CRT-function 'memset'

*** Fastest version of a CRT-function 'memset' ***

[ Abstract ]

Modern C++ compilers are able to inline most CRT-functions. For example, Microsoft and Intel
C++ compilers have an 'Enable Intrinsic Functions' option ( /Oi ). When that option is used,
a C++ compiler generates highly optimized binary codes instead of calling a CRT-function
from a Run-Time Dynamic Link Library.

An analysis of several C++ compilers was completed in order to evaluate how they deal with
a simple call to CRT-function memset ( initializes a block of memory with a value ).

Zone: 

53 posts / 0 new
Last post
For more complete information about compiler optimizations, see our Optimization Notice.

[ Test Case - C codes ]

...
// Test structure used as the block of memory to be initialized in this test case.
typedef struct tagALIGNOFDATA
{
RTint iAlignofValue[9];		// Nine RTint values
RTtchar *pszTypeName[9];	// Nine pointers to RTtchar strings
} ALIGNOFDATA;

// Instance of the test structure that is passed to CrtMemset.
ALIGNOFDATA aod;

// Fill every byte of the structure with 0x0; the byte count is sizeof( ALIGNOFDATA ).
CrtMemset( &aod, 0x0, sizeof( ALIGNOFDATA ) );
...

...
// Timing wrapper around the CRT-function memset.
//
// Reads a clock value with IrtRdtsc before and after a single call to memset,
// prints the difference with CrtPrintf and returns the destination pointer.
//
// pvDest - block of memory to be initialized
// iValue - value passed to memset as the fill value
// iCount - number of bytes passed to memset
//
// Returns pvDest ( the value returned by memset itself is not used ).
_RTINLINE RTvoid * CrtMemset( RTvoid *pvDest, RTint iValue, RTsize_t iCount )
{
// Clock value taken immediately before the call that is being measured.
_RTvolatile RTuint64 uiClock1 = IrtRdtsc();
memset( pvDest, iValue, iCount );
// Clock value taken immediately after the call that is being measured.
_RTvolatile RTuint64 uiClock2 = IrtRdtsc();
// The 64-bit difference is narrowed to RTuint so that it matches the %u specifier.
CrtPrintf( RTU("[ CrtMemset ] - Executed in %u clock cycles\n"),
( RTuint )( uiClock2 - uiClock1 ) );
return ( RTvoid * )pvDest;
}
...

[ Evaluation was done using eight C++ compilers ]

// 32-bit C++ compilers
Microsoft C++ compiler ( VS2005 PE ) 32-bit
Borland C++ compiler v5.5.1 32-bit
Intel C++ compiler v12.1.7 ( u371 ) 32-bit
MinGW C++ compiler v5.1.0 32-bit
Watcom C++ compiler v2.0.0 32-bit

// 64-bit C++ compilers
Microsoft C++ compiler ( VS2008 PE ) 64-bit
Intel C++ compiler v13.1.0 ( u149 ) 64-bit
MinGW C++ compiler v5.1.0 64-bit

[ Microsoft C++ compiler ( VS2005 PE ) 32-bit - Debug Binary codes ]

...
100143EE rdtsc
100143F0 mov dword ptr [uiClock1], eax
100143F3 mov dword ptr [ebp-8], edx
100143F6 mov eax, dword ptr [iCount]
100143F9 push eax
100143FA mov ecx, dword ptr [iValue]
100143FD push ecx
100143FE mov edx, dword ptr [pvDest]
10014401 push edx
10014402 call @ILT+400(_memset) (10011195h)
10014407 add esp, 0Ch
1001440A rdtsc
...

[ Microsoft C++ compiler ( VS2005 PE ) 32-bit - Release Binary codes ]

...
00403583 rdtsc
00403585 push 48h
00403587 lea ecx, [esp+4Ch]
0040358B push 0
0040358D mov dword ptr [esp+20h], eax
00403591 mov dword ptr [esp+24h], edx
00403595 push ecx
00403596 call 004066B0
0040359B rdtsc
...

[ Borland C++ compiler v5.5.1 32-bit - Debug Binary codes ]

...
00403D77 call 004047FC
00403D7C mov dword ptr [ebp-8], eax
00403D7F mov dword ptr [ebp-4], edx
00403D82 push dword ptr [ebp+10h]
00403D85 push dword ptr [ebp+0Ch]
00403D88 push dword ptr [ebp+8]
00403D8B call 00405AB4
00403D90 add esp, 0Ch
00403D93 call 004047FC
...

[ Borland C++ compiler v5.5.1 32-bit - Release Binary codes ]

...
00402E9E call 00404550
00402EA3 mov dword ptr [ebp-19Ch], eax
00402EA9 mov dword ptr [ebp-198h], edx
00402EAF push 48h
00402EB1 push 0
00402EB3 lea eax, [ebp-2F4h]
00402EB9 push eax
00402EBA call 00405838
00402EBF add esp, 0Ch
00402EC2 call 00404550
...

[ Intel C++ compiler v12.1.7 ( u371 ) 32-bit - Debug Binary codes ]

...
00402438 rdtsc
0040243A mov dword ptr [ebp-20h], eax
0040243D mov dword ptr [ebp-1Ch], edx
00402440 mov byte ptr [ebp-2Ch], 1
00402444 mov eax, dword ptr [ebp-20h]
00402447 mov edx, dword ptr [ebp-1Ch]
0040244A mov dword ptr [uiClock1], eax
0040244D mov dword ptr [ebp-14h], edx
00402450 add esp, 0FFFFFFF4h
00402453 mov eax, dword ptr [pvDest]
00402456 mov dword ptr [esp], eax
00402459 mov eax, dword ptr [iValue]
0040245C mov dword ptr [esp+4], eax
00402460 mov eax, dword ptr [iCount]
00402463 mov dword ptr [esp+8], eax
00402467 call memset (414770h)
0040246C add esp, 0Ch
0040246F mov dword ptr [ebp-28h], eax
00402472 rdtsc
...

[ Intel C++ compiler v12.1.7 ( u371 ) 32-bit - Release Binary codes ]

...
00402DEE rdtsc
00402DF0 mov dword ptr [ebp-208h], eax
00402DF6 mov dword ptr [ebp-204h], edx
00402DFC pxor xmm0, xmm0
00402E00 movaps xmmword ptr [ebp-1F8h], xmm0
00402E07 movaps xmmword ptr [ebp-1E8h], xmm0
00402E0E movaps xmmword ptr [ebp-1D8h], xmm0
00402E15 movaps xmmword ptr [ebp-1C8h], xmm0
00402E1C movq mmword ptr [ebp-1B8h], xmm0
00402E24 rdtsc
...

[ MinGW C++ compiler v5.1.0 32-bit - Debug Binary codes ]

...
00406459 rdtsc
0040645E mov dword ptr [ebp-10h], eax
00406461 mov dword ptr [ebp-0Ch], edx
00406464 mov eax, dword ptr [ebp+10h]
00406467 mov dword ptr [esp+8], eax
0040646B mov eax, dword ptr [ebp+0Ch]
0040646E mov dword ptr [esp+4], eax
00406472 mov eax, dword ptr [ebp+8]
00406475 mov dword ptr [esp], eax
00406478 call 00406124
0040647D rdtsc
...

[ MinGW C++ compiler v5.1.0 32-bit - Release Binary codes ]

...
00401F78 rdtsc
00401F7A mov dword ptr [esp+100h], eax
00401F81 mov dword ptr [esp+104h], edx
00401F88 mov ebx, dword ptr [esp+100h]
00401F8F mov edx, dword ptr [esp+104h]
00401F96 mov eax, dword ptr [esp+0F8h]
00401F9D mov dword ptr [esp], 40B89Ch
00401FA4 mov ecx, dword ptr [esp+0FCh]
00401FAB sub ebx, eax
00401FAD mov dword ptr [esp+4], ebx
00401FB1 call 004079DC
00401FB6 rdtsc
...

[ Watcom C++ compiler v2.0.0 32-bit - Debug Binary codes ]

...
0040560F rdtsc
00405611 mov ecx, eax
00405613 mov eax, edx
00405615 mov dword ptr [ebp-110h], ecx
0040561B mov dword ptr [ebp-10Ch], eax
00405621 mov ebx, dword ptr [ebp-108h]
00405627 mov edx, dword ptr [ebp-104h]
0040562D mov eax, dword ptr [ebp-100h]
00405633 call 0040A1A0
00405638 rdtsc
...

[ Watcom C++ compiler v2.0.0 32-bit - Release Binary codes ]

...
00402D13 rdtsc
00402D15 mov dword ptr [esp+1D0h], eax
00402D1C mov dword ptr [esp+1D4h], edx
00402D23 mov ebx, 48h
00402D28 mov eax, esp
00402D2A xor edx, edx
00402D2C call 00404D50
00402D31 rdtsc
...

[ Microsoft C++ compiler ( VS2008 PE ) 64-bit - Debug Binary codes ]

...
0000000180003F0C rdtsc
0000000180003F0E shl rdx, 20h
0000000180003F12 or rax, rdx
0000000180003F15 mov qword ptr [uiClock1], rax
0000000180003F1A mov r8, qword ptr [iCount]
0000000180003F1F mov edx, dword ptr [iValue]
0000000180003F23 mov rcx, qword ptr [pvDest]
0000000180003F28 call memset (18000B1A2h)
0000000180003F2D rdtsc
...

[ Microsoft C++ compiler ( VS2008 PE ) 64-bit - Release Binary codes ]

...
0000000140003A8C rdtsc
0000000140003A8E shl rdx, 20h
0000000140003A92 lea rcx, [rbp+10h]
0000000140003A96 or rax, rdx
0000000140003A99 xor edx, edx
0000000140003A9B lea r8d, [rdx+70h]
0000000140003A9F mov qword ptr [rbp], rax
0000000140003AA3 call 000000014000BBC0
0000000140003AA8 rdtsc
...

[ Intel C++ compiler v13.1.0 ( u149 ) 64-bit - Debug Binary codes ]

...
000000013FBF291C rdtsc
000000013FBF291E shl rdx, 20h
000000013FBF2922 or rax, rdx
000000013FBF2925 mov qword ptr [rbp+8], rax
000000013FBF2929 mov byte ptr [rbp], 1
000000013FBF292D mov rax, qword ptr [rbp+8]
000000013FBF2931 mov qword ptr [uiClock1], rax
000000013FBF2935 mov rax, qword ptr [pvDest]
000000013FBF2939 mov edx, dword ptr [iValue]
000000013FBF293C mov rcx, qword ptr [iCount]
000000013FBF2940 mov qword ptr [rbp+40h], rcx
000000013FBF2944 mov rcx, rax
000000013FBF2947 mov rax, qword ptr [rbp+40h]
000000013FBF294B mov r8, rax
000000013FBF294E call memset (13FC08BE0h)
000000013FBF2953 mov qword ptr [rbp+18h], rax
000000013FBF2957 rdtsc
...

[ Intel C++ compiler v13.1.0 ( u149 ) 64-bit - Release Binary codes ]

...
000000013F683552 rdtsc
000000013F683554 shl rdx, 20h
000000013F683558 or rax, rdx
000000013F68355B mov qword ptr [rbp+1D0h], rax
000000013F683562 vmovups xmmword ptr [r13+10h], xmm6
000000013F683568 vmovups xmmword ptr [r13+20h], xmm6
000000013F68356E vmovups xmmword ptr [r13+30h], xmm6
000000013F683574 vmovups xmmword ptr [r13+40h], xmm6
000000013F68357A vmovups xmmword ptr [r13+50h], xmm6
000000013F683580 vmovups xmmword ptr [r13+60h], xmm6
000000013F683586 vmovups xmmword ptr [r13], xmm6
000000013F68358C rdtsc
...

[ MinGW C++ compiler v5.1.0 64-bit - Debug Binary codes ]

...
000000000040728C call 0000000000407260
0000000000407291 mov qword ptr [rbp-8], rax
0000000000407295 mov rdx, qword ptr [rbp+20h]
0000000000407299 mov eax, dword ptr [rbp+18h]
000000000040729C mov r8, rdx
000000000040729F mov edx, eax
00000000004072A1 mov rcx, qword ptr [rbp+10h]
00000000004072A5 call 0000000000406DC8
00000000004072AA call 0000000000407260
...

[ MinGW C++ compiler v5.1.0 64-bit - Release Binary codes ]

...
0000000000402DD6 rdtsc
0000000000402DD8 shl rdx, 20h
0000000000402DDC or rax, rdx
0000000000402DDF mov qword ptr [rbp+38h], rax
0000000000402DE3 mov rdx, qword ptr [rbp+38h]
0000000000402DE7 mov rcx, qword ptr [rbp+30h]
0000000000402DEB sub edx, ecx
0000000000402DED lea rcx, [40B938h]
0000000000402DF4 call 0000000000407690
0000000000402DFB rdtsc
...

[ Performance Evaluation ( Debug ) - Summary - 32-bit Windows XP SP3 ]

Microsoft C++ compiler ( VS2005 PE ) 32-bit

...
[ CrtMemset ] - Executed in 316 clock cycles
[ CrtMemset ] - Executed in 424 clock cycles
[ CrtMemset ] - Executed in 388 clock cycles
[ CrtMemset ] - Executed in 344 clock cycles
[ CrtMemset ] - Executed in 336 clock cycles
[ CrtMemset ] - Executed in 336 clock cycles
[ CrtMemset ] - Executed in 336 clock cycles
[ CrtMemset ] - Executed in 372 clock cycles
[ CrtMemset ] - Executed in 336 clock cycles
[ CrtMemset ] - Executed in 332 clock cycles
...

Borland C++ compiler v5.5.1 32-bit

...
[ CrtMemset ] - Executed in 536 clock cycles
[ CrtMemset ] - Executed in 188 clock cycles
[ CrtMemset ] - Executed in 184 clock cycles
[ CrtMemset ] - Executed in 188 clock cycles
[ CrtMemset ] - Executed in 188 clock cycles
[ CrtMemset ] - Executed in 184 clock cycles
[ CrtMemset ] - Executed in 188 clock cycles
[ CrtMemset ] - Executed in 184 clock cycles
[ CrtMemset ] - Executed in 184 clock cycles
[ CrtMemset ] - Executed in 188 clock cycles
...

Intel C++ compiler v12.1.7 ( u371 ) 32-bit

...
[ CrtMemset ] - Executed in 344 clock cycles
[ CrtMemset ] - Executed in 296 clock cycles
[ CrtMemset ] - Executed in 184 clock cycles
[ CrtMemset ] - Executed in 180 clock cycles
[ CrtMemset ] - Executed in 180 clock cycles
[ CrtMemset ] - Executed in 184 clock cycles
[ CrtMemset ] - Executed in 180 clock cycles
[ CrtMemset ] - Executed in 184 clock cycles
[ CrtMemset ] - Executed in 180 clock cycles
[ CrtMemset ] - Executed in 172 clock cycles
...

MinGW C++ compiler v5.1.0 32-bit

...
[ CrtMemset ] - Executed in 728 clock cycles
[ CrtMemset ] - Executed in 412 clock cycles
[ CrtMemset ] - Executed in 324 clock cycles
[ CrtMemset ] - Executed in 320 clock cycles
[ CrtMemset ] - Executed in 328 clock cycles
[ CrtMemset ] - Executed in 320 clock cycles
[ CrtMemset ] - Executed in 324 clock cycles
[ CrtMemset ] - Executed in 324 clock cycles
[ CrtMemset ] - Executed in 320 clock cycles
[ CrtMemset ] - Executed in 328 clock cycles
...

Watcom C++ compiler v2.0.0 32-bit

...
[ CrtMemset ] - Executed in 784 clock cycles
[ CrtMemset ] - Executed in 268 clock cycles
[ CrtMemset ] - Executed in 260 clock cycles
[ CrtMemset ] - Executed in 264 clock cycles
[ CrtMemset ] - Executed in 264 clock cycles
[ CrtMemset ] - Executed in 464 clock cycles
[ CrtMemset ] - Executed in 256 clock cycles
[ CrtMemset ] - Executed in 264 clock cycles
[ CrtMemset ] - Executed in 264 clock cycles
[ CrtMemset ] - Executed in 260 clock cycles
...

[ Performance Evaluation ( Debug ) - Final Results - 32-bit Windows XP SP3 ]

Average is 208 clock cycles - Intel C++ compiler v12.1.7 ( u371 ) 32-bit
Average is 221 clock cycles - Borland C++ compiler v5.5.1 32-bit
Average is 335 clock cycles - Watcom C++ compiler v2.0.0 32-bit
Average is 352 clock cycles - Microsoft C++ compiler ( VS2005 PE ) 32-bit
Average is 373 clock cycles - MinGW C++ compiler v5.1.0 32-bit

[ Performance Evaluation ( Release ) - Summary - 32-bit Windows XP SP3 ]

Microsoft C++ compiler ( VS2005 PE ) 32-bit

...
[ CrtMemset ] - Executed in 836 clock cycles
[ CrtMemset ] - Executed in 380 clock cycles
[ CrtMemset ] - Executed in 380 clock cycles
[ CrtMemset ] - Executed in 380 clock cycles
[ CrtMemset ] - Executed in 380 clock cycles
[ CrtMemset ] - Executed in 376 clock cycles
[ CrtMemset ] - Executed in 388 clock cycles
[ CrtMemset ] - Executed in 380 clock cycles
[ CrtMemset ] - Executed in 380 clock cycles
[ CrtMemset ] - Executed in 376 clock cycles
...

Borland C++ compiler v5.5.1 32-bit

...
[ CrtMemset ] - Executed in 256 clock cycles
[ CrtMemset ] - Executed in 492 clock cycles
[ CrtMemset ] - Executed in 180 clock cycles
[ CrtMemset ] - Executed in 180 clock cycles
[ CrtMemset ] - Executed in 190 clock cycles
[ CrtMemset ] - Executed in 172 clock cycles
[ CrtMemset ] - Executed in 172 clock cycles
[ CrtMemset ] - Executed in 172 clock cycles
[ CrtMemset ] - Executed in 180 clock cycles
[ CrtMemset ] - Executed in 180 clock cycles
...

Intel C++ compiler v12.1.7 ( u371 ) 32-bit

...
[ CrtMemset ] - Executed in 88 clock cycles
[ CrtMemset ] - Executed in 88 clock cycles
[ CrtMemset ] - Executed in 88 clock cycles
[ CrtMemset ] - Executed in 88 clock cycles
[ CrtMemset ] - Executed in 88 clock cycles
[ CrtMemset ] - Executed in 89 clock cycles
[ CrtMemset ] - Executed in 88 clock cycles
[ CrtMemset ] - Executed in 88 clock cycles
[ CrtMemset ] - Executed in 88 clock cycles
[ CrtMemset ] - Executed in 88 clock cycles
...

MinGW C++ compiler v5.1.0 32-bit

...
[ CrtMemset ] - Executed in 184 clock cycles
[ CrtMemset ] - Executed in 162 clock cycles
[ CrtMemset ] - Executed in 184 clock cycles
[ CrtMemset ] - Executed in 184 clock cycles
[ CrtMemset ] - Executed in 184 clock cycles
[ CrtMemset ] - Executed in 184 clock cycles
[ CrtMemset ] - Executed in 184 clock cycles
[ CrtMemset ] - Executed in 184 clock cycles
[ CrtMemset ] - Executed in 184 clock cycles
[ CrtMemset ] - Executed in 156 clock cycles
...

Watcom C++ compiler v2.0.0 32-bit

...
[ CrtMemset ] - Executed in 736 clock cycles
[ CrtMemset ] - Executed in 212 clock cycles
[ CrtMemset ] - Executed in 260 clock cycles
[ CrtMemset ] - Executed in 212 clock cycles
[ CrtMemset ] - Executed in 220 clock cycles
[ CrtMemset ] - Executed in 212 clock cycles
[ CrtMemset ] - Executed in 212 clock cycles
[ CrtMemset ] - Executed in 260 clock cycles
[ CrtMemset ] - Executed in 264 clock cycles
[ CrtMemset ] - Executed in 212 clock cycles
...

[ Performance Evaluation ( Release ) - Final Results - 32-bit Windows XP SP3 ]

Average is 088 clock cycles - Intel C++ compiler v12.1.7 ( u371 ) 32-bit
Average is 179 clock cycles - MinGW C++ compiler v5.1.0 32-bit
Average is 217 clock cycles - Borland C++ compiler v5.5.1 32-bit
Average is 280 clock cycles - Watcom C++ compiler v2.0.0 32-bit
Average is 426 clock cycles - Microsoft C++ compiler ( VS2005 PE ) 32-bit

[ Performance Evaluation ( Debug ) - Summary - 64-bit Windows 7 SP1 ]

Microsoft C++ compiler ( VS2008 PE ) 64-bit

...
[ CrtMemset ] - Executed in 324 clock cycles
[ CrtMemset ] - Executed in 196 clock cycles
[ CrtMemset ] - Executed in 204 clock cycles
[ CrtMemset ] - Executed in 176 clock cycles
[ CrtMemset ] - Executed in 176 clock cycles
[ CrtMemset ] - Executed in 176 clock cycles
[ CrtMemset ] - Executed in 176 clock cycles
[ CrtMemset ] - Executed in 176 clock cycles
[ CrtMemset ] - Executed in 176 clock cycles
[ CrtMemset ] - Executed in 176 clock cycles
...

Intel C++ compiler v13.1.0 ( u149 ) 64-bit

...
[ CrtMemset ] - Executed in 336 clock cycles
[ CrtMemset ] - Executed in 56 clock cycles
[ CrtMemset ] - Executed in 52 clock cycles
[ CrtMemset ] - Executed in 60 clock cycles
[ CrtMemset ] - Executed in 60 clock cycles
[ CrtMemset ] - Executed in 44 clock cycles
[ CrtMemset ] - Executed in 60 clock cycles
[ CrtMemset ] - Executed in 60 clock cycles
[ CrtMemset ] - Executed in 60 clock cycles
[ CrtMemset ] - Executed in 60 clock cycles
...

MinGW C++ compiler v5.1.0 64-bit

...
[ CrtMemset ] - Executed in 252 clock cycles
[ CrtMemset ] - Executed in 216 clock cycles
[ CrtMemset ] - Executed in 216 clock cycles
[ CrtMemset ] - Executed in 216 clock cycles
[ CrtMemset ] - Executed in 204 clock cycles
[ CrtMemset ] - Executed in 184 clock cycles
[ CrtMemset ] - Executed in 180 clock cycles
[ CrtMemset ] - Executed in 204 clock cycles
[ CrtMemset ] - Executed in 180 clock cycles
[ CrtMemset ] - Executed in 184 clock cycles
...

[ Performance Evaluation ( Debug ) - Final Results - 64-bit Windows 7 SP1 ]

Average is 085 clock cycles - Intel C++ compiler v13.1.0 ( u149 ) 64-bit
Average is 196 clock cycles - Microsoft C++ compiler ( VS2008 PE ) 64-bit
Average is 204 clock cycles - MinGW C++ compiler v5.1.0 64-bit

[ Performance Evaluation ( Release ) - Summary - 64-bit Windows 7 SP1 ]

Microsoft C++ compiler ( VS2008 PE ) 64-bit

...
[ CrtMemset ] - Executed in 172 clock cycles
[ CrtMemset ] - Executed in 100 clock cycles
[ CrtMemset ] - Executed in 100 clock cycles
[ CrtMemset ] - Executed in 100 clock cycles
[ CrtMemset ] - Executed in 124 clock cycles
[ CrtMemset ] - Executed in 116 clock cycles
[ CrtMemset ] - Executed in 96 clock cycles
[ CrtMemset ] - Executed in 88 clock cycles
[ CrtMemset ] - Executed in 88 clock cycles
[ CrtMemset ] - Executed in 112 clock cycles
...

Intel C++ compiler v13.1.0 ( u149 ) 64-bit

...
[ CrtMemset ] - Executed in 28 clock cycles
[ CrtMemset ] - Executed in 28 clock cycles
[ CrtMemset ] - Executed in 28 clock cycles
[ CrtMemset ] - Executed in 28 clock cycles
[ CrtMemset ] - Executed in 28 clock cycles
[ CrtMemset ] - Executed in 28 clock cycles
[ CrtMemset ] - Executed in 24 clock cycles
[ CrtMemset ] - Executed in 24 clock cycles
[ CrtMemset ] - Executed in 28 clock cycles
[ CrtMemset ] - Executed in 24 clock cycles
...

MinGW C++ compiler v5.1.0 64-bit

...
[ CrtMemset ] - Executed in 28 clock cycles
[ CrtMemset ] - Executed in 28 clock cycles
[ CrtMemset ] - Executed in 28 clock cycles
[ CrtMemset ] - Executed in 24 clock cycles
[ CrtMemset ] - Executed in 28 clock cycles
[ CrtMemset ] - Executed in 28 clock cycles
[ CrtMemset ] - Executed in 28 clock cycles
[ CrtMemset ] - Executed in 28 clock cycles
[ CrtMemset ] - Executed in 28 clock cycles
[ CrtMemset ] - Executed in 48 clock cycles
...

[ Performance Evaluation ( Release ) - Final Results - 64-bit Windows 7 SP1 ]

Average is 027 clock cycles - Intel C++ compiler v13.1.0 ( u149 ) 64-bit
Average is 030 clock cycles - MinGW C++ compiler v5.1.0 64-bit
Average is 110 clock cycles - Microsoft C++ compiler ( VS2008 PE ) 64-bit

[ Intel C++ compiler v12.1.7 ( u371 ) 32-bit - Release ]
[ Optimized version ]

...
00402DEE rdtsc
00402DF0 mov dword ptr [ebp-208h], eax
00402DF6 mov dword ptr [ebp-204h], edx
00402DFC pxor xmm0, xmm0
00402E00 movaps xmmword ptr [ebp-1F8h], xmm0
00402E07 movaps xmmword ptr [ebp-1E8h], xmm0
00402E0E movaps xmmword ptr [ebp-1D8h], xmm0
00402E15 movaps xmmword ptr [ebp-1C8h], xmm0
00402E1C movq mmword ptr [ebp-1B8h], xmm0
00402E24 rdtsc
...

[ Intel C++ compiler v12.1.7 ( u371 ) 32-bit - Release ]
[ Non Optimized version - #pragma optimize( "", off ) was used ]

...
0040261A rdtsc
0040261C mov dword ptr [ebp-20h], eax
0040261F mov dword ptr [ebp-1Ch], edx
00402622 mov eax, dword ptr [ebp-20h]
00402625 mov edx, dword ptr [ebp-1Ch]
00402628 mov dword ptr [ebp-18h], eax
0040262B mov dword ptr [ebp-14h], edx
0040262E mov eax, dword ptr [ebp+8]
00402631 mov edx, dword ptr [ebp+0Ch]
00402634 mov ecx, dword ptr [ebp+10h]
00402637 mov edi, eax
00402639 mov eax, edx
0040263B and eax, 0FFFFh
00402640 mov ah, al
00402642 mov edx, eax
00402644 shl eax, 10h
00402647 or eax, edx
00402649 mov esi, ecx
0040264B shr ecx, 2
0040264E mov edx, edi
00402650 rep stos dword ptr es:[edi]
00402652 mov ecx, esi
00402654 and ecx, 3
00402657 rep stos byte ptr es:[edi]
00402659 mov eax, edx
0040265B mov dword ptr [ebp-30h], eax
0040265E mov eax, dword ptr [ebp-30h]
00402661 mov dword ptr [ebp-34h], eax
00402664 rdtsc
...

[ Intel C++ compiler v13.1.0 ( u149 ) 64-bit - Release Binary codes ]

...
000000013FEB3552 rdtsc
000000013FEB3554 shl rdx, 20h
000000013FEB3558 or rax, rdx
000000013FEB355B mov qword ptr [rbp+1D0h], rax
000000013FEB3562 vmovups xmmword ptr [r13+10h], xmm6
000000013FEB3568 vmovups xmmword ptr [r13+20h], xmm6
000000013FEB356E vmovups xmmword ptr [r13+30h], xmm6
000000013FEB3574 vmovups xmmword ptr [r13+40h], xmm6
000000013FEB357A vmovups xmmword ptr [r13+50h], xmm6
000000013FEB3580 vmovups xmmword ptr [r13+60h], xmm6
000000013FEB3586 vmovups xmmword ptr [r13], xmm6
000000013FEB358C rdtsc
000000013FEB358E shl rdx, 20h
000000013FEB3592 or rax, rdx
000000013FEB3595 mov qword ptr [rbp+1D8h], rax
000000013FEB359C lea rcx, [13FECF640h]
000000013FEB35A3 mov rdx, qword ptr [rbp+1D8h]
000000013FEB35AA mov rbx, qword ptr [rbp+1D0h]
000000013FEB35B1 sub rdx, rbx
000000013FEB35B4 vzeroupper
000000013FEB35B7 call 000000013FEB6950
...

[ Conclusion ]

For the given Test Case, the most efficient binary code generation and the fastest
initialization of a block of memory at run-time were achieved by the Intel C++ compiler
( 32-bit and 64-bit versions ).

Legacy 32-bit C++ compilers from Borland and Watcom outperformed 32-bit Microsoft C++
compiler but it doesn't mean that in a more complex test they will be competitive because
modern C++ compilers have built-in support of SIMD technology and these two legacy C++
compilers don't.

[ Command Line Options of C++ compilers ]

Command Line Options of C++ compilers used in these performance evaluations ( for Release configurations ) are listed below.

[ Borland C++ compiler v5.5.1 32-bit ]

-d -O2 -w -D_WIN32_BCC -DNDEBUG -5 -nRelease -eBccTestApp.exe -I"C:\WorkLib\MKL\Include" -L"C:\WorkLib\MKL\Lib\Ia32Bcc" -lS:33554432 BccTestApp.cpp HrtALLib.asm

[ MinGW C++ compiler v5.1.0 32-bit ]

MgwTestApp.cpp

-DNDEBUG

-O3

-msse2
-mprfchw

-ffast-math
-fpeel-loops
-ftree-vectorizer-verbose=0
-ftree-vectorize
-fvect-cost-model
-fomit-frame-pointer
-flto
-fwhole-program
-fopenmp

-w

-I "C:/WorkLib/ICC2011/Composer XE/Mkl/Include"
-B "../../AppsSca"

"C:/WorkLib/ICC2011/Composer XE/Mkl/Lib/Ia32/mkl_rt.lib"

-Xlinker
--stack=67108864

[ MinGW C++ compiler v5.1.0 64-bit ]

MgwTestApp.cpp

-DNDEBUG

-O3

-mavx
-mprfchw

-ffast-math
-fpeel-loops
-ftree-vectorizer-verbose=0
-ftree-vectorize
-fvect-cost-model
-fomit-frame-pointer
-fwhole-program
-fopenmp

-w

-I "C:/WorkLib/ICC2013/Composer XE/Mkl/Include"
-B "../../AppsSca"

"C:/WorkLib/ICC2013/Composer XE/Mkl/Lib/Intel64/mkl_rt.lib"

-Xlinker
--stack=1073741824

[ Microsoft C++ compiler ( VS2005 PE ) 32-bit ]

[ Compiler ]
/O2 /Ob1 /Oi /Ot /Oy /GL /I "..\..\Include" /D "WIN32" /D "_CONSOLE" /D "NDEBUG" /D "_WIN32_MSC" /D "_VC80_UPGRADE=0x0710" /D "_UNICODE" /D "UNICODE" /GF /Gm /MT /GS- /fp:fast /GR- /openmp /Yu"Stdphf.h" /Fp"Release\MscTestApp.pch" /Fo"Release/" /Fd"Release/" /W4 /nologo /c /Wp64 /Zi /Gd /TP /wd4005 /U "_WINCE_MSC" /U "WIN32_PLATFORM_PSPC" /U "WIN32_PLATFORM_WFSP" /U "WIN32_PLATFORM_WM50" /U "_WIN32_MGW" /U "_WIN32_BCC" /U "_COS16_TCC" /U "_WIN32_ICC" /U "_WIN32_WCC" /errorReport:prompt /arch:SSE2

[ Linker ]
/OUT:"Release/MscTestApp.exe" /INCREMENTAL:NO /NOLOGO /MANIFEST /MANIFESTFILE:"Release\MscTestApp.exe.intermediate.manifest" /NODEFAULTLIB:"../../Bin/Release/ScaLib.lib" /SUBSYSTEM:CONSOLE /STACK:268435456 /LARGEADDRESSAWARE /LTCG /MACHINE:X86 /ERRORREPORT:PROMPT kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib "..\..\bin\release\scalib.lib"

[ Microsoft C++ compiler ( VS2008 PE ) 64-bit ]

[ Compiler ]
/O2 /Ob1 /Oi /Ot /Oy /GL /I "..\..\Include" /D "WIN32" /D "_CONSOLE" /D "NDEBUG" /D "_WIN32_MSC" /D "_UNICODE" /D "UNICODE" /GF /Gm /MT /GS- /fp:fast /GR- /openmp /Yu"Stdphf.h" /Fp"x64\Release\ScaLibTestApp64.pch" /Fo"x64/Release/" /Fd"x64/Release/" /W4 /nologo /c /Zi /TP /wd4005 /U "_WINCE_MSC" /U "WIN32_PLATFORM_PSPC" /U "WIN32_PLATFORM_WFSP" /U "WIN32_PLATFORM_WM50" /U "_WIN32_MGW" /U "_WIN32_BCC" /U "_COS16_TCC" /U "_WIN32_ICC" /U "_WIN32_WCC" /errorReport:prompt

[ Linker ]
/OUT:"x64\Release/ScaLibTestApp64.exe" /INCREMENTAL:NO /NOLOGO /MANIFEST /MANIFESTFILE:"x64\Release\ScaLibTestApp64.exe.intermediate.manifest" /MANIFESTUAC:"level='asInvoker' uiAccess='false'" /SUBSYSTEM:CONSOLE /STACK:1073741824 /LTCG /DYNAMICBASE:NO /MACHINE:X64 /ERRORREPORT:PROMPT kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib "..\..\bin\release\scalib64.lib"

[ Intel C++ compiler v12.1.7 ( u371 ) 32-bit ]

[ Compiler ]
/c /O3 /Ob1 /Oi /Ot /Oy /Qipo /I "..\..\Include" /D "WIN32" /D "_CONSOLE" /D "NDEBUG" /D "_WIN32_ICC" /D "INTEL_SUITE_VERSION=PE121_300" /D "_VC80_UPGRADE=0x0710" /D "_UNICODE" /D "UNICODE" /GF /MT /GS- /fp:fast=2 /GR- /Yu"Stdphf.h" /Fp"Release\IccTestApp.pch" /Fo"Release/" /W5 /nologo /Wp64 /Zi /Gd /TP /Qdiag-disable:2012 /Qdiag-disable:2013 /Qdiag-disable:2014 /Qdiag-disable:2015 /Qdiag-disable:2017 /Qdiag-disable:2021 /Qdiag-disable:2022 /Qdiag-disable:2304 /U "_WIN32_MSC" /U "_WINCE_MSC" /U "WIN32_PLATFORM_PSPC" /U "WIN32_PLATFORM_WFSP" /U "WIN32_PLATFORM_WM50" /U "_WIN32_MGW" /U "_WIN32_BCC" /U "_COS16_TCC" /U "_WIN32_WCC" /Qopenmp /Qfp-speculation:fast /Qopt-matmul /Qparallel /Qstd=c++0x /Qrestrict /Qdiag-disable:111,673,10121
/Wport /Qeffc++ /QxSSE2 /Qansi-alias /Qvec-report=0 /Qfma /Qunroll:8 /Qunroll-aggressive /Qopt-streaming-stores:always /Qopt-block-factor:128 /Qopt-mem-layout-trans:2 /Wport /Qeffc++ /QxSSE2 /Qansi-alias /Qvec-report=0 /Qfma /Qunroll:8 /Qunroll-aggressive /Qopt-streaming-stores:always /Qopt-block-factor:128 /Qopt-mem-layout-trans:2

[ Linker ]
kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib /OUT:"Release/IccTestApp.exe" /INCREMENTAL:NO /nologo /MANIFEST /MANIFESTFILE:"Release\IccTestApp.exe.intermediate.manifest" /NODEFAULTLIB:"../../Bin/Release/ScaLib.lib" /TLBID:1 /SUBSYSTEM:CONSOLE /STACK:268435456 /LARGEADDRESSAWARE /MACHINE:X86 /qdiag-disable:111,673,10121

[ Intel C++ compiler v13.1.0 ( u149 ) 64-bit ]

[ Compiler ]
/c /O3 /Ob1 /Oi /Ot /Qipo /I "..\..\Include" /I "C:\WorkLib\ICC2013\Composer XE 2013\ipp\include" /D "WIN32" /D "_CONSOLE" /D "NDEBUG" /D "_WIN32_ICC" /D "INTEL_SUITE_VERSION=PE130_149" /D "_IPP_PARALLEL_DYNAMIC" /D "IPP_USE_CUSTOM" /D "_VC80_UPGRADE=0x0710" /D "_UNICODE" /D "UNICODE" /GF /MT /GS- /arch:AVX /fp:fast=2 /GR- /Yu"Stdphf.h" /Fp"x64\Release\IccTestApp64.pch" /Fo"x64/Release/" /Fd"x64/Release/" /W5 /nologo /Wp64 /Zi /TP /U "_WIN32_MSC" /U "_WINCE_MSC" /U "WIN32_PLATFORM_PSPC" /U "WIN32_PLATFORM_WFSP" /U "WIN32_PLATFORM_WM50" /U "_WIN32_MGW" /U "_WIN32_BCC" /U "_COS16_TCC" /U "_WIN32_WCC" /Qopenmp /Qfp-speculation:fast /Qopt-matmul /Qstd=c++0x /Qrestrict /Qansi-alias /Qdiag-disable:111,673,2012,2015,2960,10121 /Wport /Qeffc++ /QxAVX /Qansi-alias /Qvec-report=0 /Qfma /Qunroll /Qunroll-aggressive /Qopt-streaming-stores:always /Qipp /Qipp-link:dynamic /Qmkl

[ Linker ]
kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib /OUT:"x64\Release/IccTestApp64.exe" /INCREMENTAL:NO /nologo /LIBPATH:"C:\WorkLib\ICC2013\Composer XE 2013\ipp\lib\intel64" /LIBPATH:"C:\WorkLib\ICC2013\Composer XE 2013\compiler\lib\intel64" /MANIFEST /MANIFESTFILE:"x64\Release\IccTestApp64.exe.intermediate.manifest" /MANIFESTUAC:"level='asInvoker' uiAccess='false'" /NODEFAULTLIB:"../../Bin/Release/ScaLib64.lib" /TLBID:1 /SUBSYSTEM:CONSOLE /STACK:1000000000 /LARGEADDRESSAWARE /DYNAMICBASE /NXCOMPAT /MACHINE:X64 /qdiag-disable:111,673,2012,2015,2960,10121 /qdiag-sc-dir:"My Inspector XE Results - IccTestApp"

[ Watcom C++ compiler v2.0.0 32-bit ]

WccTestApp.cpp -5r -fp5 -fpi87 -wx -d0 -s -oabil+mprt -xd -D_WIN32_WCC -DNDEBUG -feWccTestApp.exe -k268435456 -i"C:\WorkLib\ICC2011\Compos~1\Mkl\Include" -"libpath C:\WorkLib\ICC2011\Compos~1\Mkl\Lib\Ia32Wcc" -wcd=007 -wcd=008 -wcd=013 -wcd=014 -wcd=086 -wcd=188 -wcd=367 -wcd=368 -wcd=369 -wcd=387 -wcd=389 -wcd=549 -wcd=601 -wcd=628 -wcd=689 -wcd=716 -wcd=725 -wcd=726 -wcd=735

[ Conclusion 2 ]

For the given Test Case, the most efficient binary code generation and the fastest
initialization of a block of memory at run-time were achieved by the Intel C++ compiler
( 32-bit and 64-bit versions ).

Legacy 32-bit C++ compilers from Borland and Watcom outperformed 32-bit Microsoft C++
compiler but it doesn't mean that in a more complex test they will be competitive because
modern C++ compilers have built-in support of SIMD technology and these two legacy C++
compilers don't.

The main speedup in memset, when it's applicable, is from the use of non-temporal/streaming stores.  It's usually a library call, not necessarily provided by the compiler.

[ Computer Systems used for performance evaluations ]

** Dell Precision Mobile M4700 **

Intel Core i7-3840QM ( 2.80 GHz )
Ivy Bridge / 4 cores / 8 logical CPUs / ark.intel.com/products/70846
32GB RAM
320GB HDD
NVIDIA Quadro K1000M ( 192 CUDA cores / 2GB memory )
Windows 7 Professional 64-bit SP1
Size of L3 Cache = 8MB ( shared between all cores for data & instructions )
Size of L2 Cache = 1MB ( 256KB per core / shared for data & instructions )
Size of L1 Cache = 256KB ( 32KB per core for data & 32KB per core for instructions )
Display resolution: 1366 x 768

** Dell Dimension 4400 **

Intel Pentium 4 ( 1.60 GHz / 1 core )
1GB RAM
Seagate 20GB HDD ( * )
Seagate 3TB HDD ( ** )
EVGA GeForce 6200 Video Card 512MB DDR2 AGP 8x Video Card
Windows XP Professional 32-bit SP3
Size of L2 Cache = 256KB
Size of L1 Cache = 8KB
Display resolution: 1440 x 990

( * ) Seagate Barracuda 20GB IDE Hard Disk Drive
ST320011A
3.5" 7200 Rpm 2MB Cache IDE Ultra ATA100 / ATA-iV/6
Average Rotational Latency : 4.17 ms
Average Seek Times Read : 9.0ms
Average Seek Times Write : 10.0ms
Maximum Internal Transfer Rate : 69.4MB/sec
Average External Transfer Rate : 100MB/sec ( Read and Write )
Maximum External Transfer Rate : 150MB/sec ( Read )
Note: Barracuda ATA IV Family

( ** ) Seagate Barracuda 3TB SATA Hard Disk Drive
ST3000DM001
3.5" 7200 Rpm 64MB Cache SATA III ( 6Gb/sec )
Average Rotational Latency : 4.16 ms
Average Seek Times Read : 8.5ms
Average Seek Times Write : 9.5ms
Maximum Internal Transfer Rate : 268MB/sec
Average External Transfer Rate : 156MB/sec ( Read and Write )
Maximum External Transfer Rate : 210MB/sec ( Read )

Note: The results below evaluate an inline function that is not native to the compiler ( 'memclr' ).

[ Performance Evaluation ( Release ) - Summary - 32-bit Windows XP SP3 ]

Microsoft C++ compiler ( VS2005 PE ) 32-bit

...
[ CrtMemset ] - Executed in 856 clock cycles
[ CrtMemset ] - Executed in 340 clock cycles
[ CrtMemset ] - Executed in 344 clock cycles
[ CrtMemset ] - Executed in 344 clock cycles
[ CrtMemset ] - Executed in 352 clock cycles
[ CrtMemset ] - Executed in 352 clock cycles
[ CrtMemset ] - Executed in 352 clock cycles
[ CrtMemset ] - Executed in 348 clock cycles
[ CrtMemset ] - Executed in 352 clock cycles
[ CrtMemset ] - Executed in 348 clock cycles
...

Intel C++ compiler v12.1.7 ( u371 ) 32-bit

...
[ CrtMemset ] - Executed in 476 clock cycles
[ CrtMemset ] - Executed in 444 clock cycles
[ CrtMemset ] - Executed in 488 clock cycles
[ CrtMemset ] - Executed in 352 clock cycles
[ CrtMemset ] - Executed in 292 clock cycles
[ CrtMemset ] - Executed in 350 clock cycles
[ CrtMemset ] - Executed in 292 clock cycles
[ CrtMemset ] - Executed in 280 clock cycles
[ CrtMemset ] - Executed in 300 clock cycles
[ CrtMemset ] - Executed in 440 clock cycles
...

[ Performance Evaluation ( Release ) - Final Results - 32-bit Windows XP SP3 ]

[ CRT-function - Results with 'memset' ]

Average is 088 clock cycles - Intel C++ compiler v12.1.7 ( u371 ) 32-bit
Average is 426 clock cycles - Microsoft C++ compiler ( VS2005 PE ) 32-bit

[ CRT-function - Results with 'memclr' - uses MOVNTPS ]

Average is 371 clock cycles - Intel C++ compiler v12.1.7 ( u371 ) 32-bit
Average is 399 clock cycles - Microsoft C++ compiler ( VS2005 PE ) 32-bit

Binary codes - Microsoft C++ compiler ( VS2005 PE ) 32-bit

Note: Since a function that is not native to the compiler was used, more binary codes are generated!

...
0024B6C3 rdtsc
0024B6C5 mov dword ptr [esp+18h], eax
0024B6C9 mov dword ptr [esp+1Ch], edx
0024B6CD mov eax, 48h
0024B6D2 lea edi, [esp+48h]
0024B6D6 call 00242850
0024B6DB rdtsc
...

[ 00242850 - 'memclr' ]
...
00242850 push ebp
00242851 mov ebp, esp
00242853 sub esp, 8
00242856 test edi, edi
00242858 je 002428B2
0024285A test eax, eax
0024285C jle 002428B2
0024285E mov ecx, eax
00242860 and ecx, 3Fh
00242863 mov edx, eax
00242865 sub edx, ecx
00242867 push esi
00242868 mov dword ptr [ebp-8], edi
0024286B mov dword ptr [ebp-4], edx
0024286E mov esi, dword ptr [ebp-8]
00242871 mov ecx, dword ptr [ebp-4]
00242874 shr ecx, 6
00242877 test ecx, ecx
00242879 je 00242899
0024287B prefetcht0 [esi]
0024287E xorpd xmm0, xmm0
00242882 movntps xmmword ptr [esi], xmm0
00242885 movntps xmmword ptr [esi+10h], xmm0
00242889 movntps xmmword ptr [esi+20h], xmm0
0024288D movntps xmmword ptr [esi+30h], xmm0
00242891 add esi, 40h
00242894 sub ecx, 1
00242897 jmp 00242877
00242899 sfence
0024289C cmp edx, eax
0024289E pop esi
0024289F jge 002428B2
002428A1 sub eax, edx
002428A3 push eax
002428A4 lea ecx, [edx+edi]
002428A7 push 0
002428A9 push ecx
002428AA call 0024CA40
002428AF add esp, 0Ch
002428B2 mov esp, ebp
002428B4 pop ebp
002428B5 ret
...

Binary codes - Intel C++ compiler v12.1.7 ( u371 ) 32-bit

Note: Since a function that is Non-Native to the compiler was used, more binary codes are generated!

...
00401B0F rdtsc
00401B11 mov dword ptr [ebp-170h], eax
00401B17 lea ecx, [ebp-1C8h]
00401B1D mov dword ptr [ebp-16Ch], edx
00401B23 mov dword ptr [ebp-0A0h], ecx
00401B29 mov dword ptr [ebp-9Ch], 40h
00401B33 mov esi, dword ptr [ebp-0A0h]
00401B39 mov ecx, dword ptr [ebp-9Ch]
00401B3F shr ecx, 6
00401B42 test ecx, ecx
00401B44 je 00401B64
00401B46 prefetcht0 [esi]
00401B49 xorpd xmm0, xmm0
00401B4D movntps xmmword ptr [esi], xmm0
00401B50 movntps xmmword ptr [esi+10h], xmm0
00401B54 movntps xmmword ptr [esi+20h], xmm0
00401B58 movntps xmmword ptr [esi+30h], xmm0
00401B5C add esi, 40h
00401B5F sub ecx, 1
00401B62 jmp 00401B42
00401B64 sfence
00401B67 mov edx, dword ptr [ebp-9Ch]
00401B6D cmp edx, 48h
00401B70 jge 00401BC6
00401B72 mov ecx, edx
00401B74 neg ecx
00401B76 add ecx, 48h
00401B79 mov esi, ecx
00401B7B shr esi, 1Fh
00401B7E add esi, ecx
00401B80 sar esi, 1
00401B82 mov eax, dword ptr [ebp-0A0h]
00401B88 test esi, esi
00401B8A mov dword ptr [ebp-234h], ecx
00401B90 mov dword ptr [ebp-230h], esi
00401B96 jbe 00402E4B
00401B9C xor edi, edi
00401B9E lea esi, [edx+eax]
00401BA1 xor ecx, ecx
00401BA3 mov byte ptr [esi+edi*2], cl
00401BA6 mov byte ptr [esi+edi*2+1], cl
00401BAA inc edi
00401BAB cmp edi, dword ptr [ebp-230h]
00401BB1 jb 00401BA1
00401BB3 lea ecx, [edi+edi+1]
00401BB7 cmp ecx, dword ptr [ebp-234h]
00401BBD ja 00401BC6
00401BBF add edx, eax
00401BC1 mov byte ptr [ecx+edx-1], 0
00401BC6 rdtsc
...

Conclusion is as follows:

- For Microsoft C++ compiler ( VS2005 PE ) 32-bit, codes with movntps instructions are ~6.3% faster ( 426 cc vs. 399 cc )

Note: 'cc' stands for clock cycles.

- For Intel C++ compiler v12.1.7 ( u371 ) 32-bit, codes with movntps instructions are ~4.2x slower because more binary codes are generated ( 88 cc vs. 371 cc )

- It is obvious that older versions of C++ compilers did Not create binary codes with the most efficient
Non-Temporal instructions and, as we see, movntps ( an instruction intended to prevent pollution of
cache lines ) was Not used in the first place.

>>...The main speedup in memset, when it's applicable, is from the use of non-temporal/streaming stores...

Please take a look at my Conclusion. Thanks for the useful comment.

Unfortunately Intel Compiler will not vectorize STL containers like std::vector and boost::multi_array.

Here is disassembly of std::fill used by std::vector.

00007FF645FC2543 44 8B 0D F6 BF 86 00 mov         r9d,dword ptr [__libirc_largest_cache_size (07FF64682E540h)]  
00007FF645FC254A 4D 3B C1             cmp         r8,r9  
00007FF645FC254D 7F 5A                jg          __intel_memset+8E9h (07FF645FC25A9h)  
00007FF645FC254F EB 0F                jmp         __intel_memset+8A0h (07FF645FC2560h)  
00007FF645FC2551 66 66 66 66 66 66 66 0F 1F 84 00 00 00 00 00 nop         word ptr [rax+rax]  
00007FF645FC2560 4D 8D 40 80          lea         r8,[r8-80h]  
00007FF645FC2564 66 0F 7F 01          movdqa      xmmword ptr [rcx],xmm0  ; It looks here as if ICC is not using non-temporal stores.
00007FF645FC2568 66 0F 7F 41 10       movdqa      xmmword ptr [rcx+10h],xmm0  ; unrolling by 8
00007FF645FC256D 66 0F 7F 41 20       movdqa      xmmword ptr [rcx+20h],xmm0  
00007FF645FC2572 66 0F 7F 41 30       movdqa      xmmword ptr [rcx+30h],xmm0  
00007FF645FC2577 49 81 F8 80 00 00 00 cmp         r8,80h  
00007FF645FC257E 66 0F 7F 41 40       movdqa      xmmword ptr [rcx+40h],xmm0  
00007FF645FC2583 66 0F 7F 41 50       movdqa      xmmword ptr [rcx+50h],xmm0  
00007FF645FC2588 66 0F 7F 41 60       movdqa      xmmword ptr [rcx+60h],xmm0  
00007FF645FC258D 66 0F 7F 41 70       movdqa      xmmword ptr [rcx+70h],xmm0  
00007FF645FC2592 48 8D 89 80 00 00 00 lea         rcx,[rcx+80h]  
00007FF645FC2599 7D C5                jge         __intel_memset+8A0h (07FF645FC2560h)  
00007FF645FC259B 4C 8D 1D 5E 56 87 00 lea         r11,[nbody_implementation::NBodyDArray::`vftable'+3B50h (07FF646837C00h)]  
00007FF645FC25A2 49 03 C8             add         rcx,r8  
00007FF645FC25A5 43 FF 24 C3          jmp         qword ptr [r11+r8*8]  
00007FF645FC25A9 49 83 F9 00          cmp         r9,0  
00007FF645FC25AD 74 B1                je          __intel_memset+8A0h (07FF645FC2560h)  
00007FF645FC25AF EB 0F                jmp         __intel_memset+900h (07FF645FC25C0h)  
00007FF645FC25B1 66 66 66 66 66 66 66 0F 1F 84 00 00 00 00 00 nop         word ptr [rax+rax]  
00007FF645FC25C0 49 81 E8 80 00 00 00 sub         r8,80h  
00007FF645FC25C7 66 0F E7 01          movntdq     xmmword ptr [rcx],xmm0  
00007FF645FC25CB 66 0F E7 41 10       movntdq     xmmword ptr [rcx+10h],xmm0  
00007FF645FC25D0 66 0F E7 41 20       movntdq     xmmword ptr [rcx+20h],xmm0  
00007FF645FC25D5 66 0F E7 41 30       movntdq     xmmword ptr [rcx+30h],xmm0  
00007FF645FC25DA 66 0F E7 41 40       movntdq     xmmword ptr [rcx+40h],xmm0  
00007FF645FC25DF 66 0F E7 41 50       movntdq     xmmword ptr [rcx+50h],xmm0  
00007FF645FC25E4 66 0F E7 41 60       movntdq     xmmword ptr [rcx+60h],xmm0  
00007FF645FC25E9 66 0F E7 41 70       movntdq     xmmword ptr [rcx+70h],xmm0  
00007FF645FC25EE 48 81 C1 80 00 00 00 add         rcx,80h  
00007FF645FC25F5 49 81 F8 80 00 00 00 cmp         r8,80h  
00007FF645FC25FC 7D C2                jge         __intel_memset+900h (07FF645FC25C0h)  
00007FF645FC25FE 0F AE F8             sfence  
00007FF645FC2601 4C 8D 1D F8 55 87 00 lea         r11,[nbody_implementation::NBodyDArray::`vftable'+3B50h (07FF646837C00h)]  
00007FF645FC2608 49 03 C8             add         rcx,r8  
00007FF645FC260B 43 FF 24 C3          jmp         qword ptr [r11+r8*8]  
00007FF645FC260F 90                   nop  

 

>>...
>>...Unfortunately Intel Compiler will not vectorize STL containers like std::vector and boost::multi_array.

That is always a challenge for modern vectorizing C++ compilers but the core piece of processing looks good:
...
00007FF645FC25C0 49 81 E8 80 00 00 00 sub r8,80h
00007FF645FC25C7 66 0F E7 01 movntdq xmmword ptr [rcx],xmm0
00007FF645FC25CB 66 0F E7 41 10 movntdq xmmword ptr [rcx+10h],xmm0
00007FF645FC25D0 66 0F E7 41 20 movntdq xmmword ptr [rcx+20h],xmm0
00007FF645FC25D5 66 0F E7 41 30 movntdq xmmword ptr [rcx+30h],xmm0
00007FF645FC25DA 66 0F E7 41 40 movntdq xmmword ptr [rcx+40h],xmm0
00007FF645FC25DF 66 0F E7 41 50 movntdq xmmword ptr [rcx+50h],xmm0
00007FF645FC25E4 66 0F E7 41 60 movntdq xmmword ptr [rcx+60h],xmm0
00007FF645FC25E9 66 0F E7 41 70 movntdq xmmword ptr [rcx+70h],xmm0
00007FF645FC25EE 48 81 C1 80 00 00 00 add rcx,80h
00007FF645FC25F5 49 81 F8 80 00 00 00 cmp r8,80h
00007FF645FC25FC 7D C2 jge __intel_memset+900h (07FF645FC25C0h)
...

Pages

Leave a Comment

Please sign in to add a comment. Not a member? Join today