| Thread Tools | Search this thread |
|---|
srimks
| January 21, 2009 1:10 AM PST Why does ICC-v10.0 allocate the stack with "subq" and no prologue, while ICC-v11.0 uses a "pushq" prologue? | ||||
Hi All. Below is a piece of CPP code and behaviour of asm due to ICC-v11.0 & ICC-v10.0 - -- #include <iostream> #include <dvec.h> #define MAX 1024 int main() { int i, j; int num[MAX], isort[MAX], cluster[MAX][MAX]; for (j = 0; j < MAX; j++) { num[j] = 0; isort[j] = j; for (i = 0; i < MAX; i++) { cluster[j][i] = 0; } } printf("%d %d %d\n",num[64],isort[78],cluster[384][74]); return 0; } -- whose asm using ICC-v11.0 has been created with command - $ icpc -fno-builtin test.cpp -S test.cpp(14): (col. 6) remark: LOOP WAS VECTORIZED. as --- # -- Machine type EFI2 # mark_description "Intel(R) C++ Compiler Professional for applications running on Intel(R) 64, Version 11.0 Build 20081105 %"; # mark_description "s"; # mark_description "-fno-builtin -S"; .file "test.cpp" .section .ctors, "wa" .text ..TXTST0: # -- Begin main # mark_begin; .align 16,0x90 .globl main main: ..B1.1: # Preds ..B1.0 ..___tag_value_main.1: #7.1 pushq %rbp #7.1 //save frame pointer ..___tag_value_main.2: # movq %rsp, %rbp #7.1 //set new FP ..___tag_value_main.3: # andq $-128, %rsp #7.1 subq $4202496, %rsp #7.1 //allocate stack space movl $3, %edi #7.1 ..___tag_value_main.5: #7.1 call __intel_new_proc_init #7.1 ..___tag_value_main.6: # # LOE rbx r12 r13 r14 r15 ..B1.9: # Preds ..B1.1 stmxcsr 4194304(%rsp) #7.1 orl $32832, 4194304(%rsp) #7.1 ldmxcsr 4194304(%rsp) #7.1 lea (%rsp), %rdx #15.11 xorl %esi, %esi #11.2 xorl %ecx, %ecx # movq %rdx, %rax # pxor %xmm0, %xmm0 #15.27 # LOE rax rdx rbx rsi r12 r13 r14 r15 ecx xmm0 ..B1.2: # Preds ..B1.4 ..B1.9 movl %ecx, 4198400(%rsp,%rsi,4) #13.11 movl $0, 4194304(%rsp,%rsi,4) #12.11 xorl %r8d, %r8d #14.6 movq %rax, %rdi #12.11 # LOE rax rdx rbx rsi rdi r8 r12 r13 r14 r15 ecx xmm0 ..B1.3: # Preds ..B1.3 ..B1.2 movdqa %xmm0, (%rdx,%r8,4) #15.11 movdqa %xmm0, 16(%rdi) #15.11 movdqa %xmm0, 32(%rdi) #15.11 movdqa %xmm0, 48(%rdi) #15.11 addq $64, %rdi #14.6 addq $16, %r8 #14.6 cmpq $1024, %r8 #14.6 jl ..B1.3 # Prob 99% #14.6 # LOE rax rdx rbx rsi rdi r8 r12 r13 r14 r15 
ecx xmm0 ..B1.4: # Preds ..B1.3 addq $4096, %rax #11.2 addq $4096, %rdx #11.2 incl %ecx #11.2 incq %rsi #11.2 cmpq $1024, %rsi #11.2 jl ..B1.2 # Prob 99% #11.2 # LOE rax rdx rbx rsi r12 r13 r14 r15 ecx xmm0 ..B1.5: # Preds ..B1.4 movl 4194560(%rsp), %esi #18.2 movl 4198712(%rsp), %edx #18.2 movl 1573160(%rsp), %ecx #18.2 movl $_2__STRING.0.0, %edi #18.2 xorl %eax, %eax #18.2 ..___tag_value_main.7: #18.2 call printf #18.2 ..___tag_value_main.8: # # LOE rbx r12 r13 r14 r15 ..B1.6: # Preds ..B1.5 xorl %eax, %eax #19.9 movq %rbp, %rsp #19.9 popq %rbp #19.9 ..___tag_value_main.9: # ret #19.9 .align 16,0x90 ..___tag_value_main.11: # # LOE # mark_end; .type main,@function .size main,.-main .data # -- End main .text # -- Begin __sti__$E # mark_begin; .align 16,0x90 __sti__$E: ..B2.1: # Preds ..B2.0 ..___tag_value___sti__$E.12: # pushq %rsi # ..___tag_value___sti__$E.14: # movl $_ZSt8__ioinit.0, %edi #77.25 ..___tag_value___sti__$E.15: #77.25 call _ZNSt8ios_base4InitC1Ev #77.25 ..___tag_value___sti__$E.16: # # LOE rbx rbp r12 r13 r14 r15 ..B2.2: # Preds ..B2.1 movl $_ZNSt8ios_base4InitD1Ev, %edi #77.25 .... .... --- and the asm using ICC-v10.0 has been created as below - $ /opt/intel/cce/10.0.023/bin/icpc test.cpp -S test.cpp(14): (col. 6) remark: LOOP WAS VECTORIZED. as -- # -- Machine type EFI2 # mark_description "Intel(R) C++ Compiler for applications running on Intel(R) 64, Version 10.0 Build 20070426 %s"; # mark_description "-S"; .file "test.cpp" .section .ctors, "wa" .text ..TXTST0: # -- Begin main # mark_begin; .align 2,0x90 .globl main main: ..B1.1: # Preds ..B1.0 ..___tag_value_main.1: #7.1 subq $4202504, %rsp #7.1 // Query- Why allocate for stack space has been used for subq here but prologue is not being called? 
..___tag_value_main.9: # movl $3, %edi #7.1 ..___tag_value_main.10: #7.1 call __intel_new_proc_init #7.1 ..___tag_value_main.11: # # LOE rbx rbp r12 r13 r14 r15 ..B1.9: # Preds ..B1.1 stmxcsr (%rsp) #7.1 orl $32832, (%rsp) #7.1 ldmxcsr (%rsp) #7.1 xorl %esi, %esi # xorl %ecx, %ecx # xorl %edx, %edx # movl $4096, %eax # pxor %xmm0, %xmm0 #15.27 # LOE rax rcx rbx rbp rsi r12 r13 r14 r15 edx xmm0 ..B1.2: # Preds ..B1.9 ..B1.4 movl %edx, 4096(%rsp,%rsi) #13.11 movl $0, (%rsp,%rsi) #12.11 movq %rcx, %rdi #12.11 # LOE rax rcx rbx rbp rsi rdi r12 r13 r14 r15 edx xmm0 ..B1.3: # Preds ..B1.3 ..B1.2 movdqa %xmm0, 8192(%rsp,%rdi) #15.11 movdqa %xmm0, 8208(%rsp,%rdi) #15.11 movdqa %xmm0, 8224(%rsp,%rdi) #15.11 movdqa %xmm0, 8240(%rsp,%rdi) #15.11 movdqa %xmm0, 8256(%rsp,%rdi) #15.11 movdqa %xmm0, 8272(%rsp,%rdi) #15.11 movdqa %xmm0, 8288(%rsp,%rdi) #15.11 movdqa %xmm0, 8304(%rsp,%rdi) #15.11 addq $128, %rdi #14.6 cmpq %rax, %rdi #14.6 jl ..B1.3 # Prob 99% #14.6 # LOE rax rcx rbx rbp rsi rdi r12 r13 r14 r15 edx xmm0 ..B1.4: # Preds ..B1.3 addq $4, %rsi #11.2 addq $4096, %rax #11.2 addq $4096, %rcx #11.2 addl $1, %edx #11.2 cmpq $4198400, %rax #11.2 jl ..B1.2 # Prob 99% #11.2 # LOE rax rcx rbx rbp rsi r12 r13 r14 r15 edx xmm0 ..B1.5: # Preds ..B1.4 movl 256(%rsp), %esi #18.2 movl 4408(%rsp), %edx #18.2 movl 1581352(%rsp), %ecx #18.2 movl $_2__STRING.0.0, %edi #18.2 xorl %eax, %eax #18.2 ..___tag_value_main.12: #18.2 call printf #18.2 ..___tag_value_main.13: # # LOE rbx rbp r12 r13 r14 r15 ..B1.6: # Preds ..B1.5 xorl %eax, %eax #19.9 addq $4202504, %rsp #19.9 ..___tag_value_main.14: # ret #19.9 .align 2,0x90 ..___tag_value_main.15: # # LOE # mark_end; .type main,@function .size main,.-main .data # -- End main .text # -- Begin __sti__$E # mark_begin; .align 2,0x90 __sti__$E: ..B2.1: # Preds ..B2.0 ..___tag_value___sti__$E.16: # pushq %rsi # ..___tag_value___sti__$E.24: # movl $_ZSt8__ioinit.0, %edi #77.25 ..___tag_value___sti__$E.25: #77.25 call _ZNSt8ios_base4InitC1Ev #77.25 
..___tag_value___sti__$E.26: # # LOE rbx rbp r12 r13 r14 r15 ..B2.2: # Preds ..B2.1 movl $_ZNSt8ios_base4InitD1Ev, %edi #77.25 movl $_ZSt8__ioinit.0, %esi #77.25 ... ... -- Query: (a) Why, with ICC-v10.0, does the stack allocation for main start with a "subq" instruction, and why is no "pushq" prologue observed? (b) Does ICC-v11.0 have any specific reason for using a prologue for main rather than the plain allocation used by ICC-v10.0? (c) The sizes of the above asm listings are 9120 & 8792 respectively. Surely the use of "-fno-builtin" with ICC-v11.0 should not bloat the asm size? (d) The ICC-v10.0 asm contains "pushq %rsi", i.e. it pushes register "rsi" (used to pass an argument to a function) but not "rbp", whereas the ICC-v11.0 asm pushes both "rbp", a callee-saved register, and "rsi", used to pass an argument to a function. Any insights w.r.t. performance? (e) If the same code above is compiled with ICC-v11.0 using "-fno-builtin", the allocation in main starts with "subq", but without "-fno-builtin" it starts with "pushq"; also, the use of "-fno-builtin" generates a larger asm that contains the "MOVNTDQ" instruction. Normally, the use of MOVNTDQ speeds up the code, but here the opposite happens. Why such weird behaviour? ~BR | |||||
|
|||||||||||||
|
|||||||||||||
|
|||||||||||||
| 8470 users have contributed to 31601 threads and 100646 posts to date. |
|---|
| In the past 24 hours, we have 30 new thread(s), 113 new post(s), and 159 new user(s). In the past 3 days, the most popular thread for everyone has been gemm(A,A,A) like possible? The most posts were made to gemm(A,A,A) like possible? The post with the most views is Dear Steve, excuse me for a d Please welcome our newest member kopernikus |