Vec BKM: Utilize full vectors

Vec BKM：使用全向量

当 vec-loop 的 trip-count（循环迭代次数）不是 vec-length（向量长度）的倍数时，剩余循环（remainder loop）负责执行剩余的迭代。尽管这在许多情况下无法避免，但在剩余循环上花费大量时间会降低性能和效率。例如，如果 vec-loop 的 trip-count 为 20、vec-length 为 16，则意味着内核循环每次执行后，剩余的 4 次迭代必须在剩余循环中执行。尽管 KNC 编译器能够对剩余循环进行向量化处理（正如 -vec-report6 所报告的），但其效率不如内核循环。例如，剩余循环将使用屏蔽（masking），而且（由于内存故障保护问题）不得不使用收集/分散（gather/scatter）操作，而不是单位步长（unit-stride）的加载/存储。解决此问题的最好方法是重构算法/代码，使剩余循环不在运行时执行（通过使 trip-count 成为 vec-length 的倍数来实现），或者使 trip-count 远大于 vec-length（从而降低在剩余循环中执行操作的相对开销）。

% cat -n t2.c

1  #include <stdio.h>

2

3  void foo1(float *a, float *b, float *c, int n)

4  {

5    int i;

6  #pragma ivdep

7    for (i=0; i<n; i++) {

8      a[i] *= b[i] + c[i];

9    }

10  }

11

12  void foo2(float *a, float *b, float *c, int n)

13  {

14    int i;

15  #pragma ivdep

16    for (i=0; i<20; i++) {

17      a[i] *= b[i] - c[i];

18    }

19  }

20

% icc -O2 -vec-report6 t2.c -c -mmic -inline-level=0
t2.c(8): (col. 5) remark: vectorization support: reference a has aligned access.
t2.c(8): (col. 5) remark: vectorization support: reference a has aligned access.
t2.c(8): (col. 5) remark: vectorization support: reference b has unaligned access.
t2.c(8): (col. 5) remark: vectorization support: reference c has aligned access.
t2.c(8): (col. 5) remark: vectorization support: unaligned access used inside loop body.
t2.c(7): (col. 3) remark: vectorization support: unroll factor set to 2.
t2.c(7): (col. 3) remark: LOOP WAS VECTORIZED.
t2.c(8): (col. 5) remark: vectorization support: reference a has unaligned access.
t2.c(8): (col. 5) remark: vectorization support: reference a has unaligned access.
t2.c(8): (col. 5) remark: vectorization support: reference b has unaligned access.
t2.c(8): (col. 5) remark: vectorization support: reference c has unaligned access.
t2.c(8): (col. 5) remark: vectorization support: unaligned access used inside loop body.
t2.c(7): (col. 3) remark: PEEL LOOP WAS VECTORIZED.
t2.c(8): (col. 5) remark: vectorization support: reference a has aligned access.
t2.c(8): (col. 5) remark: vectorization support: reference a has aligned access.
t2.c(8): (col. 5) remark: vectorization support: reference b has unaligned access.
t2.c(8): (col. 5) remark: vectorization support: reference c has unaligned access.
t2.c(8): (col. 5) remark: vectorization support: reference a has aligned access.
t2.c(8): (col. 5) remark: vectorization support: reference a has aligned access.
t2.c(8): (col. 5) remark: vectorization support: reference b has unaligned access.
t2.c(8): (col. 5) remark: vectorization support: reference c has unaligned access.
t2.c(8): (col. 5) remark: vectorization support: unaligned access used inside loop body.
t2.c(7): (col. 3) remark: REMAINDER LOOP WAS VECTORIZED.
t2.c(17): (col. 5) remark: vectorization support: reference a has unaligned access.
t2.c(17): (col. 5) remark: vectorization support: reference a has unaligned access.
t2.c(17): (col. 5) remark: vectorization support: reference b has unaligned access.
t2.c(17): (col. 5) remark: vectorization support: reference c has unaligned access.
t2.c(17): (col. 5) remark: vectorization support: unaligned access used inside loop body.
t2.c(16): (col. 3) remark: LOOP WAS VECTORIZED.
t2.c(17): (col. 5) remark: vectorization support: reference a has unaligned access.
t2.c(17): (col. 5) remark: vectorization support: reference a has unaligned access.
t2.c(17): (col. 5) remark: vectorization support: reference b has unaligned access.
t2.c(17): (col. 5) remark: vectorization support: reference c has unaligned access.
t2.c(17): (col. 5) remark: vectorization support: unaligned access used inside loop body.
t2.c(16): (col. 3) remark: loop was not vectorized: vectorization possible but seems inefficient.

1. 使用 -opt-assume-safe-padding 编译器选项。此选项可在编译器针对向量化剩余循环和向量化剥离循环生成的指令序列中提供帮助，从而提高此类循环中内存操作的性能。使用该选项时，用户须保证在所分配的数组/指针末尾留出 64 字节的安全填充（如下面第二个 malloc 调用所示）。

ptr = (float *)malloc(sizeof(float) * n);

ptr = (float *)malloc(sizeof(float) * n + 64);

..B2.7:                         # Preds ..B2.9 ..B2.6 Latency 13

vpcmpgtd  %zmm0, %zmm2, %k0                             #7.3 c1

nop                                                     #7.3 c5

knot      %k0, %k1                                      #7.3 c9

jkzd      ..B2.9, %k1   # Prob 20%                      #7.3 c13

# LOE rdx rbx rbp rsi rdi r9 r10 r11 r12 r13 r14

r15 eax ecx r8d zmm0 zmm1 zmm2 zmm3 k1

..B2.8:                         # Preds ..B2.7 Latency 53

vmovaps   %zmm1, %zmm4                                  #8.13 c1

vmovaps   %zmm1, %zmm5                                  #8.20 c5

vmovaps   %zmm1, %zmm6                                  #8.5 c9

vaddps    %zmm5, %zmm4, %zmm7                           #8.20 c37

vmulps    %zmm7, %zmm6, %zmm8                           #8.5 c41

nop                                                     #8.5 c45

vpackstorelps %zmm8, (%rdi,%r10,4){%k1}                 #8.5 c49

vpackstorehps %zmm8, 64(%rdi,%r10,4){%k1}               #8.5 c53

movb      %al, %al                                      #8.5 c53

# LOE rdx rbx rbp rsi rdi r9 r10 r11 r12 r13 r14

r15 eax ecx r8d zmm0 zmm1 zmm2 zmm3

..B2.9:                         # Preds ..B2.7 ..B2.8 Latency 9

vpaddd    %zmm3, %zmm2, %zmm2                           #7.3 c5

cmpq      %r11, %r10                                    #7.3 c5

jb        ..B2.7        # Prob 82%                      #7.3 c9

..B2.7:                         # Preds ..B2.9 ..B2.6 Latency 13

vpcmpgtd  %zmm0, %zmm2, %k0                             #7.3 c1

nop                                                     #7.3 c5

knot      %k0, %k4                                      #7.3 c9

jkzd      ..B2.9, %k4   # Prob 20%                      #7.3 c13

# LOE rax rdx rbx rbp rsi rdi r9 r11 r13 r15 ecx

r8d r10d zmm0 zmm1 zmm2 zmm3 k4

..B2.8:                         # Preds ..B2.7 Latency 57

vmovaps   .L_2il0floatpacket.10(%rip), %zmm8            #8.5 c1

vmovaps   %zmm1, %zmm4                                  #8.13 c5

lea       (%rsi,%r13), %r14                             #8.13 c5

vmovaps   %zmm1, %zmm5                                  #8.20 c9

kmov      %k4, %k2                                      #8.13 c9

..L15:                                                          #8.13

vgatherdps (%r14,%zmm8,4), %zmm4{%k2}                   #8.13

jkzd      ..L14, %k2    # Prob 50%                      #8.13

vgatherdps (%r14,%zmm8,4), %zmm4{%k2}                   #8.13

jknzd     ..L15, %k2    # Prob 50%                      #8.13

..L14:                                                          #

vmovaps   %zmm1, %zmm6                                  #8.5 c21

kmov      %k4, %k3                                      #8.20 c21

lea       (%rdx,%r13), %r14                             #8.20 c25

lea       (%rdi,%r13), %r12                             #8.5 c25

..L17:                                                          #8.20

vgatherdps (%r14,%zmm8,4), %zmm5{%k3}                   #8.20

jkzd      ..L16, %k3    # Prob 50%                      #8.20

vgatherdps (%r14,%zmm8,4), %zmm5{%k3}                   #8.20

jknzd     ..L17, %k3    # Prob 50%                      #8.20

..L16:                                                          #

vaddps    %zmm5, %zmm4, %zmm7                           #8.20 c37

kmov      %k4, %k1                                      #8.5 c37

..L19:                                                          #8.5

vgatherdps (%r12,%zmm8,4), %zmm6{%k1}                   #8.5

jkzd      ..L18, %k1    # Prob 50%                      #8.5

vgatherdps (%r12,%zmm8,4), %zmm6{%k1}                   #8.5

jknzd     ..L19, %k1    # Prob 50%                      #8.5

..L18:                                                          #

vmulps    %zmm7, %zmm6, %zmm9                           #8.5 c49

nop                                                     #8.5 c53

..L21:                                                          #8.5

vscatterdps %zmm9, (%r12,%zmm8,4){%k4}                  #8.5

jkzd      ..L20, %k4    # Prob 50%                      #8.5

vscatterdps %zmm9, (%r12,%zmm8,4){%k4}                  #8.5

jknzd     ..L21, %k4    # Prob 50%                      #8.5

..L20:                                                          #

# LOE rax rdx rbx rbp rsi rdi r9 r11 r13 r15 ecx

r8d r10d zmm0 zmm1 zmm2 zmm3

..B2.9:                         # Preds ..B2.7 ..B2.8 Latency 9

vpaddd    %zmm3, %zmm2, %zmm2                           #7.3 c5

cmpq      %r9, %rax                                     #7.3 c5

jb        ..B2.7        # Prob 82%                      #7.3 c9

void foo(short * restrict a, short *restrict b, short * restrict c)

{

int i;

for(i = 0; i < N; i++) {

a[i] = b[i] + c[i];

}

}

..B1.6:

lea       (%rax,%rsi), %r10

andq      $63, %r10

cmpq      $32, %r10

jle       ..L3

..L3:

vprefetch1 256(%rax,%rsi)

lea       (%rax,%rdx), %r10

andq      $63, %r10

cmpq      $32, %r10

jle       ..L4

..L4:

vprefetch0 128(%rax,%rsi)

vprefetch1 256(%rax,%rdx)

vpandd    %zmm0, %zmm3, %zmm4

vprefetch0 128(%rax,%rdx)

vprefetch1 256(%rax,%rdi)

lea       (%rax,%rdi), %r10

andq      $63, %r10

cmpq      $32, %r10

jle       ..L5

vpackstorehd %zmm4{uint16}, 64(%rax,%rdi)

..L5:

vpackstoreld %zmm4{uint16}, (%rax,%rdi)

vprefetch0 128(%rax,%rdi)

cmpq      $992, %rcx

jb        ..B1.6

..L9:

vpgatherdd 1984(%rdx,%zmm3,2){sint16}, %zmm1{%k2}

jkzd      ..L8, %k2

vpgatherdd 1984(%rdx,%zmm3,2){sint16}, %zmm1{%k2}

jknzd     ..L9, %k2

..L8:

vpandd    .L_2il0floatpacket.3(%rip), %zmm2, %zmm4

nop

..L11:

vpscatterdd %zmm4{uint16}, 1984(%rdi,%zmm3,2){%k3}

jkzd      ..L10, %k3

vpscatterdd %zmm4{uint16}, 1984(%rdi,%zmm3,2){%k3}

jknzd     ..L11, %k3

..L10:

..B1.6:

vprefetch1 256(%rax,%rsi)

vprefetch0 128(%rax,%rsi)

vprefetch1 256(%rax,%rdx)

vprefetch0 128(%rax,%rdx)

vprefetch1 256(%rax,%rdi)

vpandd    %zmm0, %zmm3, %zmm4

vprefetch0 128(%rax,%rdi)

movb      %dl, %dl

vpackstoreld %zmm4{uint16}, (%rax,%rdi)

vpackstorehd %zmm4{uint16}, 64(%rax,%rdi)

cmpq      $992, %rcx

jb        ..B1.6

vpandd    .L_2il0floatpacket.3(%rip), %zmm2, %zmm3

nop

vpackstoreld %zmm3{uint16}, 1984(%rdi){%k1}

vpackstorehd %zmm3{uint16}, 2048(%rdi){%k1}

movb      %al, %al

类别：
有关编译器优化的更多信息，请参阅优化声明（Optimization Notice）。