Using AVX opcodes slows down my procedure

Using AVX opcodes slows down my procedure

Strangely, the AVX version of this code is 20 times slower than the XMM (SSE) version.
CPU: i7-6950X, 3.0 GHz.
Running on Windows 10 x64. Any idea how to make this code run properly?
The IDA Pro disassembly shows that the code is encoded correctly, with no errors.

// Steps across the inclusive span X0..X1 (X1 - X0 + 1 iterations). Each step
// advances a 4 x Double accumulator (YMM10, initially loaded from [R8]) by
// YMM13 and checks it against the bit mask in YMM15. When every mask bit is
// set, a Single is computed as R13 / (sum of three lanes of
// YMM12 * YMM10 * YMM14) and compared with the dword at [R10]; if it is lower,
// it is stored to both [R10] and [R11].
//
// Inputs that are read here but never loaded by this routine, so the caller
// must have prepared them: R10, R11 (advanced by 4 bytes per step, before
// use), R12D, R13 (raw bits of a Double), MM6, YMM12..YMM15.
// NOTE(review): the meaning of these registers is not visible in this
// routine - confirm against the caller. R8 is presumably the address of P
// under the Win64 calling convention - TODO confirm how TVertexD is passed.
//
// Performance: every SSE instruction inside the loop is VEX-encoded. Running
// a legacy-encoded SSE instruction while the upper halves of the YMM
// registers are in use, and then returning to 256-bit VEX code, costs an
// expensive state transition on Intel CPUs each time; the original loop did
// that on every step. VZEROUPPER is deliberately NOT used: it would clear the
// upper halves of YMM10 and YMM12..YMM15, which stay live across iterations.
// The VEX.128 forms zero bits 128..255 of their destination (YMM0, YMM1,
// YMM2); none of those upper halves is read before being overwritten.
procedure ScanLineVec256(X0, X1: Integer; P: TVertexD);
asm
  .NOFRAME

  // Iteration count = X1 - X0 + 1.
  sub X1, X0
  inc X1
  // Empty or inverted span: leave now. Without this check the dec/jnz loop
  // below would wrap around and run roughly 2^32 steps, writing memory.
  jle @Done

  // NOTE(review): these three are still legacy-encoded and run once per call,
  // so they can still trigger one SSE/AVX transition pair per call when the
  // caller arrives with dirty upper YMM state. MOVQ2DQ has no VEX form, and
  // VEX-encoding the XMM11 writes would zero the upper half of YMM11.
  // XMM3 / XMM11 are not read in this routine - presumably consumed
  // elsewhere; verify before moving or removing.
  movq2dq xmm3, mm6
  movd xmm11, r12d
  shufps xmm11, xmm11, 00b

  //vmovdqu ymm10, [R8]
  db $C4, $41, $7E, $6F, $10

 @X:

  // Advance both output pointers before they are used in this step.
  add r10, 4
  add r11, 4

  //vaddpd ymm10, ymm10, ymm13
  db $C4, $41, $2D, $58, $D5

  // Mask test: ((ymm10 and ymm15) xor ymm15) = 0 exactly when every bit set
  // in ymm15 is also set in ymm10.
  //vandpd ymm0, ymm10, ymm15
  db $C4, $C1, $2D, $54, $C7
  //vxorpd ymm0, ymm0, ymm15
  db $C4, $C1, $7D, $57, $C7
  //vptest ymm0, ymm0
  db $C4, $E2, $7D, $17, $C0

  jz @Inside
  dec X1
  jnz @X
  ret

  @Inside:

  //vmovdqa ymm1, ymm12
  db $C5, $7D, $7F, $E1

  //vmulpd ymm1, ymm1, ymm10
  db $C4, $C1, $75, $59, $CA

  // Keep ymm12 * ymm10 in ymm4 for the conversion in @Below.
  //vmovdqa ymm4, ymm1
  db $C5, $FD, $7F, $CC

  //vmulpd ymm1, ymm1, ymm14
  db $C4, $C1, $75, $59, $CE

  // Horizontal sum of lanes 0, 1 and 2 of ymm1.
  //vextractf128 xmm2, ymm1, 01b      (xmm2 = lanes 2 and 3)
  db $C4, $E3, $7D, $19, $CA, $01
  //vaddsd xmm2, xmm2, xmm1           (was: addsd xmm2, xmm1)   lane2 + lane0
  db $C5, $EB, $58, $D1
  //vpsrldq xmm1, xmm1, 8             (was: psrldq xmm1, 8)     lane1 -> low
  db $C5, $F1, $73, $D9, $08
  //vaddsd xmm1, xmm1, xmm2           (was: addsd xmm1, xmm2)
  db $C5, $F3, $58, $CA

  // xmm0 = Single(R13 / sum)
  //vmovq xmm0, r13                   (was: movq xmm0, R13)
  db $C4, $C1, $F9, $6E, $C5
  //vdivsd xmm0, xmm0, xmm1           (was: divsd xmm0, xmm1)
  db $C5, $FB, $5E, $C1
  //vcvtsd2ss xmm0, xmm0, xmm0        (was: cvtsd2ss xmm0, xmm0)
  db $C5, $FB, $5A, $C0

  // Sets the same flags as COMISS, so the JB below is unchanged.
  //vcomiss xmm0, dword ptr [r10]     (was: comiss xmm0, dword ptr [r10])
  db $C4, $C1, $78, $2F, $02
  jb @Below
  dec X1
  jnz @X
  ret

 @Below:
  //vmovd dword ptr [r10], xmm0       (was: movd dword ptr [r10], xmm0)
  db $C4, $C1, $79, $7E, $02
  //vshufps xmm0, xmm0, xmm0, 00b     (was: shufps xmm0, xmm0, 00b)
  db $C5, $F8, $C6, $C0, $00

  // NOTE(review): the converted value in xmm4 is not read anywhere in this
  // routine, and the store below writes xmm0 again rather than xmm4 - kept
  // exactly as in the original; confirm the intent.
  //vcvtpd2ps xmm4, ymm4
  db $C5, $FD, $5A, $E4

  //vmovd dword ptr [r11], xmm0       (was: movd dword ptr [r11], xmm0)
  db $C4, $C1, $79, $7E, $03

  dec X1
  jnz @X

 @Done:
end;

 

1 post / 0 new
For more complete information about compiler optimizations, see our Optimization Notice.