Hi, I am optimizing a compiler to use more SSE instructions, but I found that my SSE code is about 50% slower than the x87 code. Could you give me some comments? Thanks.

Hi, I am optimizing a compiler to use more SSE instructions, but I found that my SSE code is about 50% slower than the x87 code. Could you give me some comments? Thanks.

Hi,

my email:zhoudka@gmail.com

my msn:dk_zhou@hotmail.com

There are two attached files. One was generated by GCC with the -O2 optimization option (nbody.s.1); the other (nbody.s) was modified from nbody.s.1 by replacing the x87 instructions with SSE instructions.

I used standard x86 instructions plus SSE instructions to do the calculation, while GCC with the -O2 optimization option produced code performing the same function.

My code is 104 lines of assembly, and GCC's code is 105 lines of assembly.

My binary code is 356 bytes, and GCC's is 233 bytes.

I call the function 10,000,000 times.

My code is about 50% slower than GCC's code — why?

Can somebody give me some ideas?

gcc's code:

00000008 :
8: d9 ee fldz
a: 55 push %ebp
b: 89 e5 mov %esp,%ebp
d: 83 ec 1c sub $0x1c,%esp
10: 57 push %edi
11: 56 push %esi
12: 53 push %ebx
13: 31 d2 xor %edx,%edx
15: 3b 55 0c cmp 0xc(%ebp),%edx
18: 0f 8d c5 00 00 00 jge e3
1e: 66 90 xchg %ax,%ax
20: 8b 4d 10 mov 0x10(%ebp),%ecx
23: 8d 04 d5 00 00 00 00 lea 0x0(,%edx,8),%eax
2a: 29 d0 sub %edx,%eax
2c: 8d 34 c1 lea (%ecx,%eax,8),%esi
2f: dd 46 18 fldl 0x18(%esi)
32: dd 46 20 fldl 0x20(%esi)
35: d9 c9 fxch %st(1)
37: d8 c8 fmul %st(0),%st
39: d9 c9 fxch %st(1)
3b: d8 c8 fmul %st(0),%st
3d: de c1 faddp %st,%st(1)
3f: dd 46 28 fldl 0x28(%esi)
42: d8 c8 fmul %st(0),%st
44: dd 46 30 fldl 0x30(%esi)
47: dc 0d 00 00 00 00 fmull 0x0
4d: d9 ca fxch %st(2)
4f: de c1 faddp %st,%st(1)
51: de c9 fmulp %st,%st(1)
53: 8b 45 0c mov 0xc(%ebp),%eax
56: 42 inc %edx
57: 89 55 fc mov %edx,-0x4(%ebp)
5a: de c1 faddp %st,%st(1)

5c: 39 c2 cmp %eax,%edx
5e: 7d 77 jge d7
60: 8b 7d 0c mov 0xc(%ebp),%edi
63: 89 d0 mov %edx,%eax
65: c1 e0 03 shl $0x3,%eax
68: 29 d0 sub %edx,%eax
6a: 8d 1c c1 lea (%ecx,%eax,8),%ebx
6d: 29 d7 sub %edx,%edi
6f: 90 nop
70: dd 06 fldl (%esi)
72: dd 46 08 fldl 0x8(%esi)
75: dd 46 10 fldl 0x10(%esi)
78: d9 ca fxch %st(2)
7a: dc 23 fsubl (%ebx)
7c: d9 c9 fxch %st(1)
7e: dc 63 08 fsubl 0x8(%ebx)
81: d9 ca fxch %st(2)
83: dc 63 10 fsubl 0x10(%ebx)
86: d9 c9 fxch %st(1)
88: d8 c8 fmul %st(0),%st
8a: d9 ca fxch %st(2)
8c: d8 c8 fmul %st(0),%st
8e: d9 c9 fxch %st(1)
90: d8 c8 fmul %st(0),%st
92: d9 ca fxch %st(2)
94: de c1 faddp %st,%st(1)
96: de c1 faddp %st,%st(1)
98: d9 c0 fld %st(0)
9a: d9 fa fsqrt
9c: dd e0 fucom %st(0)
9e: df e0 fnstsw %ax
a0: 80 e4 45 and $0x45,%ah
a3: 80 fc 40 cmp $0x40,%ah
a6: 74 1d je c5
a8: dd d8 fstp %st(0)
aa: 83 c4 f8 add $0xfffffff8,%esp
ad: 83 ec 08 sub $0x8,%esp
b0: dd 1c 24 fstpl (%esp)
b3: db 7d f0 fstpt -0x10(%ebp)
b6: e8 fc ff ff ff call b7
bb: 83 c4 10 add $0x10,%esp
be: db 6d f0 fldt -0x10(%ebp)
c1: d9 c9 fxch %st(1)
c3: eb 02 jmp c7

c5: dd d9 fstp %st(1)
c7: dd 46 30 fldl 0x30(%esi)
ca: dc 4b 30 fmull 0x30(%ebx)
cd: de f1 fdivp %st,%st(1)
cf: 83 c3 38 add $0x38,%ebx
d2: de e9 fsubrp %st,%st(1)
d4: 4f dec %edi
d5: 75 99 jne 70
d7: 8b 55 fc mov -0x4(%ebp),%edx
da: 3b 55 0c cmp 0xc(%ebp),%edx
dd: 0f 8c 3d ff ff ff jl 20
e3: 8b 45 08 mov 0x8(%ebp),%eax
e6: dd 18 fstpl (%eax)
e8: 8d 65 d8 lea -0x28(%ebp),%esp
eb: 5b pop %ebx
ec: 5e pop %esi
ed: 5f pop %edi
ee: 89 ec mov %ebp,%esp
f0: 5d pop %ebp
f1: c3 ret

my code:

00000008 :
8: 55 push %ebp
9: 89 e5 mov %esp,%ebp
b: 83 ec 40 sub $0x40,%esp
e: c7 45 d0 00 00 00 00 movl $0x0,-0x30(%ebp)
15: c7 45 c8 00 00 00 00 movl $0x0,-0x38(%ebp)
1c: c7 45 cc 00 00 00 00 movl $0x0,-0x34(%ebp)
23: bb 00 00 00 00 mov $0x0,%ebx
28: 66 0f 3a 22 fb 02 pinsrd $0x2,%ebx,%xmm7
2e: bb 00 00 00 00 mov $0x0,%ebx
33: 66 0f 3a 22 fb 03 pinsrd $0x3,%ebx,%xmm7
39: 8b 45 d0 mov -0x30(%ebp),%eax
3c: 39 45 0c cmp %eax,0xc(%ebp)
3f: 7f 0c jg 4d
41: 8b 45 08 mov 0x8(%ebp),%eax
44: 66 0f 17 38 movhpd %xmm7,(%eax)
48: e9 11 01 00 00 jmp 15e
4d: 8b 45 d0 mov -0x30(%ebp),%eax
50: 89 c1 mov %eax,%ecx
52: 6b c0 38 imul $0x38,%eax,%eax
55: 41 inc %ecx
56: 89 4d d4 mov %ecx,-0x2c(%ebp)
59: 03 45 10 add 0x10(%ebp),%eax
5c: 89 c1 mov %eax,%ecx
5e: 89 4d d8 mov %ecx,-0x28(%ebp)
61: f2 0f f0 70 28 lddqu 0x28(%eax),%xmm6
66: 0f 12 c6 movhlps %xmm6,%xmm0
69: f2 0f 59 05 38 00 00 mulsd 0x38,%xmm0
70: 00
71: f2 0f f0 68 18 lddqu 0x18(%eax),%xmm5
76: 66 0f 59 ed mulpd %xmm5,%xmm5
7a: f2 0f 59 f6 mulsd %xmm6,%xmm6
7e: 0f 12 cd movhlps %xmm5,%xmm1
81: f2 0f 58 e9 addsd %xmm1,%xmm5
85: f2 0f 58 ee addsd %xmm6,%xmm5
89: f2 0f 59 c5 mulsd %xmm5,%xmm0
8d: 0f 12 cf movhlps %xmm7,%xmm1

90: f2 0f 58 c1 addsd %xmm1,%xmm0
94: 66 0f 13 45 c8 movlpd %xmm0,-0x38(%ebp)
99: 0f 16 f8 movlhps %xmm0,%xmm7
9c: 8b 45 d4 mov -0x2c(%ebp),%eax
9f: 39 45 0c cmp %eax,0xc(%ebp)
a2: 7f 05 jg a9
a4: ff 45 d0 incl -0x30(%ebp)
a7: eb 90 jmp 39
a9: 66 0f 12 7d d4 movlpd -0x2c(%ebp),%xmm7
ae: 66 0f 3a 16 f8 00 pextrd $0x0,%xmm7,%eax
b4: 6b c0 38 imul $0x38,%eax,%eax
b7: 66 0f 3a 16 f9 00 pextrd $0x0,%xmm7,%ecx
bd: 41 inc %ecx
be: 89 4d d4 mov %ecx,-0x2c(%ebp)
c1: 03 45 10 add 0x10(%ebp),%eax
c4: 89 c1 mov %eax,%ecx
c6: 89 4d dc mov %ecx,-0x24(%ebp)
c9: 66 0f 3a 16 fa 01 pextrd $0x1,%xmm7,%edx
cf: 66 0f 12 02 movlpd (%edx),%xmm0
d3: 66 0f 3a 16 fb 01 pextrd $0x1,%xmm7,%ebx
d9: 66 0f 16 43 08 movhpd 0x8(%ebx),%xmm0
de: f2 0f f0 30 lddqu (%eax),%xmm6
e2: 66 0f 5c c6 subpd %xmm6,%xmm0
e6: 66 0f 12 48 30 movlpd 0x30(%eax),%xmm1
eb: 0f 16 c8 movlhps %xmm0,%xmm1
ee: 66 0f 3a 16 fb 01 pextrd $0x1,%xmm7,%ebx
f4: 66 0f 12 53 30 movlpd 0x30(%ebx),%xmm2
f9: 0f 16 d0 movlhps %xmm0,%xmm2
fc: 66 0f 59 ca mulpd %xmm2,%xmm1
100: 66 0f 13 45 e0 movlpd %xmm0,-0x20(%ebp)
105: 66 0f 3a 16 fa 01 pextrd $0x1,%xmm7,%edx
10b: 66 0f 12 52 10 movlpd 0x10(%edx),%xmm2
110: f2 0f 5c 50 10 subsd 0x10(%eax),%xmm2
115: 66 0f 3a 0d c2 01 blendpd $0x1,%xmm2,%xmm0
11b: 66 0f 3a 0d d0 02 blendpd $0x2,%xmm0,%xmm2
121: 66 0f 59 c2 mulpd %xmm2,%xmm0
125: 66 0f 17 55 e8 movhpd %xmm2,-0x18(%ebp)
12a: 66 0f 13 55 f0 movlpd %xmm2,-0x10(%ebp)
12f: 0f 12 d1 movhlps %xmm1,%xmm2
132: 0f 12 d8 movhlps %xmm0,%xmm3
135: f2 0f 58 d3 addsd %xmm3,%xmm2
139: f2 0f 58 d0 addsd %xmm0,%xmm2
13d: f2 0f 51 c2 sqrtsd %xmm2,%xmm0
141: f2 0f 5e c8 divsd %xmm0,%xmm1
145: 66 0f 13 45 f8 movlpd %xmm0,-0x8(%ebp)

14a: 0f 12 c7 movhlps %xmm7,%xmm0
14d: f2 0f 5c c1 subsd %xmm1,%xmm0
151: 66 0f 13 45 c8 movlpd %xmm0,-0x38(%ebp)
156: 0f 16 f8 movlhps %xmm0,%xmm7
159: e9 3e ff ff ff jmp 9c
15e: 89 ec mov %ebp,%esp
160: 5d pop %ebp
161: c3 ret

My code modifies ESI, EDI, and EBX, so if you want to call this function, you have to save those registers first!

My code uses a global constant:

LDouble0$inline:
.long 0x0,0x3fe00000

Two attached files:

nbody.s.1 was generated by: gcc -O2 -S nbody.c -o nbody.s.1

nbody.s was modified from nbody.s.1, replacing the x87 instructions with SSE instructions.

2 posts / novo 0
Último post
Para obter mais informações sobre otimizações de compiladores, consulte Aviso sobre otimizações.

1. FPU code is pipelined.

2. Your SSE code does not compute any values in parallel (you can do two divisions in parallel with DIVPD, two square roots with SQRTPD, etc).

3. Your SSE code has many partial register accesses which should be avoided.

4. Your SSE code appears to perform many unnecessary operations (such as zeroing the EBX register before each PINSRD). I am not even sure why you use those PINSRD instructions at all, when you could simply XORPD the XMM7 register beforehand to initialize it to 0.

Finally, I am not even sure that your assembly code produces a correct result. Did you test the accuracy of your code?

I suggest you post your original C code so we can start optimizing from there.

--
Regards,
Igor Levicki

If you find my post helpful, please rate it and/or select it as a best answer where applicable. Thank you.

Deixar um comentário

Faça login para adicionar um comentário. Não é membro? Inscreva-se hoje mesmo!