Why is 64-bit integer division less performant on the Core i7 than 32-bit integer division?
While participating in an Intel contest, I discovered that optimising code for 64-bit mode does not always give a performance gain. More precisely, it depends on the chosen processor: in particular, the new-generation Core i7 processor shows a performance degradation in this case.
Here is the code sample.
32-bit include:
/* 32-bit build: every portable register alias names the 32-bit register. */
#define _eax    eax
#define _ebx    ebx
#define _ecx    ecx
#define _edx    edx
#define _esi    esi
#define _edi    edi

/* Word-size multiplier relative to 32 bits (1 => 32-bit words). */
#define SCALE   1

/* Operand-size specifier used for word-sized memory operands. */
#define PTR     dword ptr
64-bit include:
/* 64-bit build: every portable register alias names the 64-bit register. */
#define _eax    rax
#define _ebx    rbx
#define _ecx    rcx
#define _edx    rdx
#define _esi    rsi
#define _edi    rdi

/* Word-size multiplier relative to 32 bits (2 => 64-bit words). */
#define SCALE   2

/*
 * Operand-size specifier for word-sized memory operands. The 32-bit include
 * defines PTR as "dword ptr"; the 64-bit counterpart must be defined as well,
 * otherwise code using PTR [...] has no operand size in the 64-bit build.
 */
#define PTR     qword ptr
common include:
/* Sizes derived from SCALE, which the 32-bit or 64-bit include must define first. */

/* Width of one machine word in bits. */
#define BITS    (SCALE * 32)

/* Size of one machine word in bytes. */
#define BYTES   (SCALE * 4)

/* Size of two machine words in bytes. */
#define BYTES2  (SCALE * 8)
problematic code:
// Unrolled long-division loop: 8 division steps per iteration.
//
// Each step zeroes _eax and executes "div _ebx", so the dividend _edx:_eax is
// _edx * 2^BITS. The quotient is left in _eax and the remainder in _edx, where
// it becomes the high half of the next step's dividend.
// The quotient is added to the word at [_esi]; the carry out of that addition
// is added to the word at [_esi - BYTES]. _esi then advances by BYTES2.
//
// Register roles as used here:
//   _ebx = divisor            _edx = running remainder
//   _esi = output pointer     _ecx = iteration counter (loop ends at zero)
//
// NOTE(review): the initial register values are set outside this block. "div"
// raises a divide error unless _edx < _ebx on entry -- confirm at the caller.
//
// The pointer update uses the _esi alias, not a hard-coded "esi": in a 64-bit
// build a write to esi zero-extends into rsi and would truncate the pointer.
asm {
$loopX:
    // step 1
    xor _eax, _eax
    div _ebx
    add PTR [_esi], _eax
    adc PTR [_esi - BYTES], 0
    add _esi, BYTES2

    // step 2
    xor _eax, _eax
    div _ebx
    add PTR [_esi], _eax
    adc PTR [_esi - BYTES], 0
    add _esi, BYTES2

    // step 3
    xor _eax, _eax
    div _ebx
    add PTR [_esi], _eax
    adc PTR [_esi - BYTES], 0
    add _esi, BYTES2

    // step 4
    xor _eax, _eax
    div _ebx
    add PTR [_esi], _eax
    adc PTR [_esi - BYTES], 0
    add _esi, BYTES2

    // step 5
    xor _eax, _eax
    div _ebx
    add PTR [_esi], _eax
    adc PTR [_esi - BYTES], 0
    add _esi, BYTES2

    // step 6
    xor _eax, _eax
    div _ebx
    add PTR [_esi], _eax
    adc PTR [_esi - BYTES], 0
    add _esi, BYTES2

    // step 7
    xor _eax, _eax
    div _ebx
    add PTR [_esi], _eax
    adc PTR [_esi - BYTES], 0
    add _esi, BYTES2

    // step 8
    xor _eax, _eax
    div _ebx
    add PTR [_esi], _eax
    adc PTR [_esi - BYTES], 0
    add _esi, BYTES2

    // next iteration while the counter is non-zero
    dec _ecx
    jne $loopX
}
Thanks.