icc18 vectorizer unnecessarily ands with 0x00ff after shifting word right by 8

icc18 vectorizer unnecessarily ands with 0x00ff after shifting word right by 8

This code

#include <stdint.h>

int const count = 1024;
uint8_t p[count];

void mul(uint16_t m) // scales every byte of p in place by m, keeping bits 8..15 of each product
{
#pragma simd vectorlength(8)
	for (int i = 0; i < count; ++i) // the pragma above requests a vector length of 8
		p[i] = uint16_t(p[i] * m) >> 8; // product truncated to 16 bits, then >> 8: result is always 0..255
}

compiled with -Os generates:

# mul(unsigned short): compiler output for the loop p[i] = uint16_t(p[i] * m) >> 8.
# Reads m from di, broadcasts it to 8 words, then handles 8 bytes of p per iteration.
# Existing "#line.col" annotations refer to the C++ source; the text after them is explanatory.
mul(unsigned short):
  xor edx, edx #9.2 rdx = byte offset into p, starts at 0
  movzx eax, di #10.26 eax = m, zero-extended from 16 bits
  pxor xmm2, xmm2 #10.19 xmm2 = all zero; used below for the unpack and the pack
  movdqu xmm1, XMMWORD PTR .L_2il0floatpacket.0[rip] #10.32 xmm1 = 0x00ff in each of the 8 words
  movd xmm3, eax #10.26 low dword of xmm3 = m
  punpcklwd xmm3, xmm3 #10.26 this and the next two unpacks replicate m ...
  punpckldq xmm3, xmm3 #10.26 ... across wider and wider lanes ...
  punpcklqdq xmm3, xmm3 #10.26 ... until all 8 words of xmm3 hold m
..B1.2: # Preds ..B1.2 ..B1.1
  lea rax, QWORD PTR [p+rdx] #10.19 rax = address of the current 8 bytes of p
  movq xmm0, QWORD PTR [rax] #10.19 load 8 bytes
  punpcklbw xmm0, xmm2 #10.19 interleave with zero bytes: 8 bytes become 8 zero-extended words
  pmullw xmm0, xmm3 #10.32 low 16 bits of each word * m
  psrlw xmm0, 8 #10.32 logical shift: every word is now <= 0x00ff
  pand xmm0, xmm1 #10.32 mask with 0x00ff; no effect, the high bytes were already cleared by psrlw
  packuswb xmm0, xmm2 #10.32 narrow the 8 words to bytes; upper 8 result bytes come from zero xmm2
  movq QWORD PTR [rax], xmm0 #10.3 store the 8 result bytes back over the input
  add rdx, 8 #9.2 advance by the 8 bytes just processed
  cmp rdx, 1024 #9.2 1024 = count in the C++ source
  jb ..B1.2 # Prob 99% #9.2 loop while rdx < 1024 (unsigned compare)
  ret #11.1
# NOTE(review): no storage directive for p appears in this listing, so p and the
# constant below show up at the same position; presumably elided by the listing filter.
p:
.L_2il0floatpacket.0:
  .long 0x00ff00ff,0x00ff00ff,0x00ff00ff,0x00ff00ff # 16-byte mask loaded into xmm1 above

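with a redundant pand instruction: after psrlw xmm0, 8 the upper byte of every 16-bit word is already zero, so masking with 0x00ff changes nothing.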

3 posts / 0 new
Last post
For more complete information about compiler optimizations, see our Optimization Notice.

Is the generated code the same if the shift count (8) is cast to the same type?

Makes no difference: https://godbolt.org/g/pzbiKu

Leave a Comment

Please sign in to add a comment. Not a member? Join today