# Vectorization Problems

## Vectorization Problems

I have problems trying to translate code written with intrinsics to OpenCL vector code. I am working with double precision. The code is doing vectorized filtering of data using a filter of length 16.

The original code is (sorry but I could not reduce it further):

```const double filter[] __attribute__ ((aligned (16))) = {

8.4334247333529341094733325815816e-7,

-0.1290557201342060969516786758559028e-4,

0.8762984476210559564689161894116397e-4,

-0.30158038132690463167163703826169879e-3,

0.174723713672993903449447812749852942e-2,

-0.942047030201080385922711540948195075e-2,

0.2373821463724942397566389712597274535e-1,

0.612625895831207982195380597e-1,

0.9940415697834003993178616713,

-0.604895289196983516002834636e-1,

-0.2103025160930381434955489412839065067e-1,

0.1337263414854794752733423467013220997e-1,

-0.344128144493493857280881509686821861e-2,

0.49443227688689919192282259476750972e-3,

-0.5185986881173432922848639136911487e-4,

2.72734492911979659657715313017228e-6};
const double filter_u[] __attribute__ ((aligned (16))) = {

2.72734492911979659657715313017228e-6,

8.4334247333529341094733325815816e-7,

-0.1290557201342060969516786758559028e-4,

0.8762984476210559564689161894116397e-4,

-0.30158038132690463167163703826169879e-3,

0.174723713672993903449447812749852942e-2,

-0.942047030201080385922711540948195075e-2,

0.2373821463724942397566389712597274535e-1,

0.612625895831207982195380597e-1,

0.9940415697834003993178616713,

-0.604895289196983516002834636e-1,

-0.2103025160930381434955489412839065067e-1,

0.1337263414854794752733423467013220997e-1,

-0.344128144493493857280881509686821861e-2,

0.49443227688689919192282259476750972e-3,

-0.5185986881173432922848639136911487e-4,

2.72734492911979659657715313017228e-6};
#define conv_4x2_block_fused(offset_filter,offset_source,d00,d10,d20,d30) \

void conv_4x2_fused(size_t ndat, double const * source0, double const * source1,

double const * source2, double const * source3,

double       * dest){

__m128d S00,S01,S10,S11,S20,S21,S30,S31;

__m128d FA,FU;

__m128d D00,D10,D20,D30;

S00 = _mm_mul_pd(D00,FA);

S10 = _mm_mul_pd(D10,FA);

S20 = _mm_mul_pd(D20,FA);

S30 = _mm_mul_pd(D30,FA);

S01 = _mm_mul_pd(S01,FU);

S11 = _mm_mul_pd(S11,FU);

S21 = _mm_mul_pd(S21,FU);

S31 = _mm_mul_pd(S31,FU);
conv_4x2_block_fused(2,2,D00,D10,D20,D30);

conv_4x2_block_fused(4,4,D00,D10,D20,D30);

conv_4x2_block_fused(6,6,D00,D10,D20,D30);

conv_4x2_block_fused(8,8,D00,D10,D20,D30);

conv_4x2_block_fused(10,10,D00,D10,D20,D30);

conv_4x2_block_fused(12,12,D00,D10,D20,D30);

conv_4x2_block_fused(14,14,D00,D10,D20,D30);

}

```

Here is the generated assembly (gcc-4.6, but icc generates similar code):
```0000000000408970 :

408970:	66 0f 28 0e          	movapd (%rsi),%xmm1

408974:	66 0f 28 1a          	movapd (%rdx),%xmm3

408978:	66 0f 28 01          	movapd (%rcx),%xmm0

40897c:	66 0f 28 e9          	movapd %xmm1,%xmm5

408980:	66 0f 12 8e 80 00 00 	movlpd 0x80(%rsi),%xmm1

408987:	00

408988:	66 0f 28 35 80 ee 00 	movapd 0xee80(%rip),%xmm6     # 417810

40898f:	00

408990:	66 0f 28 fb          	movapd %xmm3,%xmm7

408994:	66 0f 12 9a 80 00 00 	movlpd 0x80(%rdx),%xmm3

40899b:	00

40899c:	66 41 0f 28 10       	movapd (%r8),%xmm2

4089a1:	66 0f 28 e0          	movapd %xmm0,%xmm4

4089a5:	66 0f 12 81 80 00 00 	movlpd 0x80(%rcx),%xmm0

4089ac:	00

4089ad:	66 44 0f 28 05 ca ed 	movapd 0xedca(%rip),%xmm8     # 417780

4089b4:	00 00

4089b6:	66 0f 59 ee          	mulpd  %xmm6,%xmm5

4089ba:	66 44 0f 28 56 10    	movapd 0x10(%rsi),%xmm10

4089c0:	66 0f 59 fe          	mulpd  %xmm6,%xmm7

4089c4:	66 44 0f 28 0d 53 ee 	movapd 0xee53(%rip),%xmm9     # 417820

4089cb:	00 00

4089cd:	66 0f 59 e6          	mulpd  %xmm6,%xmm4

4089d1:	66 0f 59 f2          	mulpd  %xmm2,%xmm6

4089d5:	66 44 0f 28 5a 10    	movapd 0x10(%rdx),%xmm11

4089db:	66 41 0f 12 90 80 00 	movlpd 0x80(%r8),%xmm2

4089e2:	00 00

4089e4:	66 41 0f 59 c8       	mulpd  %xmm8,%xmm1

4089e9:	66 41 0f 59 d8       	mulpd  %xmm8,%xmm3

4089ee:	66 41 0f 59 c0       	mulpd  %xmm8,%xmm0

4089f3:	66 41 0f 59 d0       	mulpd  %xmm8,%xmm2

4089f8:	66 45 0f 28 c2       	movapd %xmm10,%xmm8

4089fd:	66 45 0f 59 c1       	mulpd  %xmm9,%xmm8

408a02:	66 41 0f 58 e8       	addpd  %xmm8,%xmm5

408a07:	66 44 0f 28 05 80 ed 	movapd 0xed80(%rip),%xmm8     # 417790

408a0e:	00 00

408a10:	66 45 0f 59 d0       	mulpd  %xmm8,%xmm10

408a15:	66 41 0f 58 ca       	addpd  %xmm10,%xmm1

408a1a:	66 45 0f 28 d3       	movapd %xmm11,%xmm10

408a1f:	66 45 0f 59 d0       	mulpd  %xmm8,%xmm10

408a24:	66 45 0f 59 d9       	mulpd  %xmm9,%xmm11

408a29:	66 41 0f 58 da       	addpd  %xmm10,%xmm3

408a2e:	66 44 0f 28 51 10    	movapd 0x10(%rcx),%xmm10

408a34:	66 41 0f 58 fb       	addpd  %xmm11,%xmm7

408a39:	66 45 0f 28 58 10    	movapd 0x10(%r8),%xmm11

408a3f:	66 45 0f 28 e2       	movapd %xmm10,%xmm12

408a44:	66 45 0f 59 e1       	mulpd  %xmm9,%xmm12

408a49:	66 45 0f 59 cb       	mulpd  %xmm11,%xmm9

408a4e:	66 45 0f 59 d8       	mulpd  %xmm8,%xmm11

408a53:	66 45 0f 59 c2       	mulpd  %xmm10,%xmm8

408a58:	66 44 0f 28 56 20    	movapd 0x20(%rsi),%xmm10

408a5e:	66 41 0f 58 e4       	addpd  %xmm12,%xmm4

408a63:	66 41 0f 58 f1       	addpd  %xmm9,%xmm6

408a68:	66 44 0f 28 0d bf ed 	movapd 0xedbf(%rip),%xmm9     # 417830

408a6f:	00 00

408a71:	66 41 0f 58 d3       	addpd  %xmm11,%xmm2

408a76:	66 44 0f 28 5a 20    	movapd 0x20(%rdx),%xmm11

408a7c:	66 41 0f 58 c0       	addpd  %xmm8,%xmm0

408a81:	66 45 0f 28 c2       	movapd %xmm10,%xmm8

408a86:	66 45 0f 59 c1       	mulpd  %xmm9,%xmm8

408a8b:	66 41 0f 58 e8       	addpd  %xmm8,%xmm5

408a90:	66 44 0f 28 05 07 ed 	movapd 0xed07(%rip),%xmm8     # 4177a0

408a97:	00 00

408a99:	66 45 0f 59 d0       	mulpd  %xmm8,%xmm10

408a9e:	66 41 0f 58 ca       	addpd  %xmm10,%xmm1

408aa3:	66 45 0f 28 d3       	movapd %xmm11,%xmm10

408aa8:	66 45 0f 59 d0       	mulpd  %xmm8,%xmm10

408aad:	66 45 0f 59 d9       	mulpd  %xmm9,%xmm11

408ab2:	66 41 0f 58 da       	addpd  %xmm10,%xmm3

408ab7:	66 44 0f 28 51 20    	movapd 0x20(%rcx),%xmm10

408abd:	66 41 0f 58 fb       	addpd  %xmm11,%xmm7

408ac2:	66 45 0f 28 58 20    	movapd 0x20(%r8),%xmm11

408ac8:	66 45 0f 28 e2       	movapd %xmm10,%xmm12

408acd:	66 45 0f 59 e1       	mulpd  %xmm9,%xmm12

408ad2:	66 45 0f 59 cb       	mulpd  %xmm11,%xmm9

408ad7:	66 45 0f 59 d8       	mulpd  %xmm8,%xmm11

408adc:	66 45 0f 59 c2       	mulpd  %xmm10,%xmm8

408ae1:	66 44 0f 28 56 30    	movapd 0x30(%rsi),%xmm10

408ae7:	66 41 0f 58 e4       	addpd  %xmm12,%xmm4

408aec:	66 41 0f 58 f1       	addpd  %xmm9,%xmm6

408af1:	66 44 0f 28 0d 46 ed 	movapd 0xed46(%rip),%xmm9     # 417840

408af8:	00 00

408afa:	66 41 0f 58 d3       	addpd  %xmm11,%xmm2

408aff:	66 44 0f 28 5a 30    	movapd 0x30(%rdx),%xmm11

408b05:	66 41 0f 58 c0       	addpd  %xmm8,%xmm0

408b0a:	66 45 0f 28 c2       	movapd %xmm10,%xmm8

408b0f:	66 45 0f 59 c1       	mulpd  %xmm9,%xmm8

408b14:	66 41 0f 58 e8       	addpd  %xmm8,%xmm5

408b19:	66 44 0f 28 05 8e ec 	movapd 0xec8e(%rip),%xmm8     # 4177b0

408b20:	00 00

408b22:	66 45 0f 59 d0       	mulpd  %xmm8,%xmm10

408b27:	66 41 0f 58 ca       	addpd  %xmm10,%xmm1

408b2c:	66 45 0f 28 d3       	movapd %xmm11,%xmm10

408b31:	66 45 0f 59 d0       	mulpd  %xmm8,%xmm10

408b36:	66 45 0f 59 d9       	mulpd  %xmm9,%xmm11

408b3b:	66 41 0f 58 da       	addpd  %xmm10,%xmm3

408b40:	66 44 0f 28 51 30    	movapd 0x30(%rcx),%xmm10

408b46:	66 41 0f 58 fb       	addpd  %xmm11,%xmm7

408b4b:	66 45 0f 28 58 30    	movapd 0x30(%r8),%xmm11

408b51:	66 45 0f 28 e2       	movapd %xmm10,%xmm12

408b56:	66 45 0f 59 e1       	mulpd  %xmm9,%xmm12

408b5b:	66 45 0f 59 cb       	mulpd  %xmm11,%xmm9

408b60:	66 45 0f 59 d8       	mulpd  %xmm8,%xmm11

408b65:	66 45 0f 59 c2       	mulpd  %xmm10,%xmm8

408b6a:	66 44 0f 28 56 40    	movapd 0x40(%rsi),%xmm10

408b70:	66 41 0f 58 e4       	addpd  %xmm12,%xmm4

408b75:	66 41 0f 58 f1       	addpd  %xmm9,%xmm6

408b7a:	66 44 0f 28 0d cd ec 	movapd 0xeccd(%rip),%xmm9     # 417850

408b81:	00 00

408b83:	66 41 0f 58 d3       	addpd  %xmm11,%xmm2

408b88:	66 44 0f 28 5a 40    	movapd 0x40(%rdx),%xmm11

408b8e:	66 41 0f 58 c0       	addpd  %xmm8,%xmm0

408b93:	66 45 0f 28 c2       	movapd %xmm10,%xmm8

408b98:	66 45 0f 59 c1       	mulpd  %xmm9,%xmm8

408b9d:	66 41 0f 58 e8       	addpd  %xmm8,%xmm5

408ba2:	66 44 0f 28 05 15 ec 	movapd 0xec15(%rip),%xmm8     # 4177c0

408ba9:	00 00

408bab:	66 45 0f 59 d0       	mulpd  %xmm8,%xmm10

408bb0:	66 41 0f 58 ca       	addpd  %xmm10,%xmm1

408bb5:	66 45 0f 28 d3       	movapd %xmm11,%xmm10

408bba:	66 45 0f 59 d0       	mulpd  %xmm8,%xmm10

408bbf:	66 45 0f 59 d9       	mulpd  %xmm9,%xmm11

408bc4:	66 41 0f 58 da       	addpd  %xmm10,%xmm3

408bc9:	66 44 0f 28 51 40    	movapd 0x40(%rcx),%xmm10

408bcf:	66 41 0f 58 fb       	addpd  %xmm11,%xmm7

408bd4:	66 45 0f 28 58 40    	movapd 0x40(%r8),%xmm11

408bda:	66 45 0f 28 e2       	movapd %xmm10,%xmm12

408bdf:	66 45 0f 59 e1       	mulpd  %xmm9,%xmm12

408be4:	66 45 0f 59 cb       	mulpd  %xmm11,%xmm9

408be9:	66 45 0f 59 d8       	mulpd  %xmm8,%xmm11

408bee:	66 45 0f 59 c2       	mulpd  %xmm10,%xmm8

408bf3:	66 44 0f 28 56 50    	movapd 0x50(%rsi),%xmm10

408bf9:	66 41 0f 58 e4       	addpd  %xmm12,%xmm4

408bfe:	66 41 0f 58 f1       	addpd  %xmm9,%xmm6

408c03:	66 44 0f 28 0d 54 ec 	movapd 0xec54(%rip),%xmm9     # 417860

408c0a:	00 00

408c0c:	66 41 0f 58 d3       	addpd  %xmm11,%xmm2

408c11:	66 44 0f 28 5a 50    	movapd 0x50(%rdx),%xmm11

408c17:	66 41 0f 58 c0       	addpd  %xmm8,%xmm0

408c1c:	66 45 0f 28 c2       	movapd %xmm10,%xmm8

408c21:	66 45 0f 59 c1       	mulpd  %xmm9,%xmm8

408c26:	66 41 0f 58 e8       	addpd  %xmm8,%xmm5

408c2b:	66 44 0f 28 05 9c eb 	movapd 0xeb9c(%rip),%xmm8     # 4177d0

408c32:	00 00

408c34:	66 45 0f 59 d0       	mulpd  %xmm8,%xmm10

408c39:	66 41 0f 58 ca       	addpd  %xmm10,%xmm1

408c3e:	66 45 0f 28 d3       	movapd %xmm11,%xmm10

408c43:	66 45 0f 59 d0       	mulpd  %xmm8,%xmm10

408c48:	66 45 0f 59 d9       	mulpd  %xmm9,%xmm11

408c4d:	66 41 0f 58 da       	addpd  %xmm10,%xmm3

408c52:	66 44 0f 28 51 50    	movapd 0x50(%rcx),%xmm10

408c58:	66 41 0f 58 fb       	addpd  %xmm11,%xmm7

408c5d:	66 45 0f 28 58 50    	movapd 0x50(%r8),%xmm11

408c63:	66 45 0f 28 e2       	movapd %xmm10,%xmm12

408c68:	66 45 0f 59 e1       	mulpd  %xmm9,%xmm12

408c6d:	66 45 0f 59 cb       	mulpd  %xmm11,%xmm9

408c72:	66 45 0f 59 d8       	mulpd  %xmm8,%xmm11

408c77:	66 45 0f 59 c2       	mulpd  %xmm10,%xmm8

408c7c:	66 44 0f 28 56 60    	movapd 0x60(%rsi),%xmm10

408c82:	66 41 0f 58 e4       	addpd  %xmm12,%xmm4

408c87:	66 41 0f 58 f1       	addpd  %xmm9,%xmm6

408c8c:	66 44 0f 28 0d db eb 	movapd 0xebdb(%rip),%xmm9     # 417870

408c93:	00 00

408c95:	66 41 0f 58 d3       	addpd  %xmm11,%xmm2

408c9a:	66 44 0f 28 5a 60    	movapd 0x60(%rdx),%xmm11

408ca0:	66 41 0f 58 c0       	addpd  %xmm8,%xmm0

408ca5:	66 45 0f 28 c2       	movapd %xmm10,%xmm8

408caa:	66 45 0f 59 c1       	mulpd  %xmm9,%xmm8

408caf:	66 41 0f 58 e8       	addpd  %xmm8,%xmm5

408cb4:	66 44 0f 28 05 23 eb 	movapd 0xeb23(%rip),%xmm8     # 4177e0

408cbb:	00 00

408cbd:	66 45 0f 59 d0       	mulpd  %xmm8,%xmm10

408cc2:	66 41 0f 58 ca       	addpd  %xmm10,%xmm1

408cc7:	66 45 0f 28 d3       	movapd %xmm11,%xmm10

408ccc:	66 45 0f 59 d0       	mulpd  %xmm8,%xmm10

408cd1:	66 45 0f 59 d9       	mulpd  %xmm9,%xmm11

408cd6:	66 41 0f 58 da       	addpd  %xmm10,%xmm3

408cdb:	66 44 0f 28 51 60    	movapd 0x60(%rcx),%xmm10

408ce1:	66 41 0f 58 fb       	addpd  %xmm11,%xmm7

408ce6:	66 45 0f 28 58 60    	movapd 0x60(%r8),%xmm11

408cec:	66 45 0f 28 e2       	movapd %xmm10,%xmm12

408cf1:	66 45 0f 59 e1       	mulpd  %xmm9,%xmm12

408cf6:	66 45 0f 59 cb       	mulpd  %xmm11,%xmm9

408cfb:	66 45 0f 59 d8       	mulpd  %xmm8,%xmm11

408d00:	66 45 0f 59 c2       	mulpd  %xmm10,%xmm8

408d05:	66 44 0f 28 56 70    	movapd 0x70(%rsi),%xmm10

408d0b:	66 41 0f 58 e4       	addpd  %xmm12,%xmm4

408d10:	66 41 0f 58 f1       	addpd  %xmm9,%xmm6

408d15:	66 44 0f 28 0d 62 eb 	movapd 0xeb62(%rip),%xmm9     # 417880

408d1c:	00 00

408d1e:	66 41 0f 58 d3       	addpd  %xmm11,%xmm2

408d23:	66 44 0f 28 5a 70    	movapd 0x70(%rdx),%xmm11

408d29:	66 41 0f 58 c0       	addpd  %xmm8,%xmm0

408d2e:	66 45 0f 28 c2       	movapd %xmm10,%xmm8

408d33:	66 45 0f 59 c1       	mulpd  %xmm9,%xmm8

408d38:	66 41 0f 58 e8       	addpd  %xmm8,%xmm5

408d3d:	66 44 0f 28 05 aa ea 	movapd 0xeaaa(%rip),%xmm8     # 4177f0

408d44:	00 00

408d46:	66 45 0f 59 d0       	mulpd  %xmm8,%xmm10

408d4b:	66 41 0f 58 ca       	addpd  %xmm10,%xmm1

408d50:	66 45 0f 28 d3       	movapd %xmm11,%xmm10

408d55:	66 45 0f 59 d0       	mulpd  %xmm8,%xmm10

408d5a:	66 45 0f 59 d9       	mulpd  %xmm9,%xmm11

408d5f:	66 41 0f 58 da       	addpd  %xmm10,%xmm3

408d64:	66 44 0f 28 51 70    	movapd 0x70(%rcx),%xmm10

408d6a:	66 41 0f 58 fb       	addpd  %xmm11,%xmm7

408d6f:	66 45 0f 28 58 70    	movapd 0x70(%r8),%xmm11

408d75:	66 45 0f 28 e2       	movapd %xmm10,%xmm12

408d7a:	66 45 0f 59 e1       	mulpd  %xmm9,%xmm12

408d7f:	66 0f 7c cb          	haddpd %xmm3,%xmm1

408d83:	66 45 0f 59 cb       	mulpd  %xmm11,%xmm9

408d88:	66 0f 7c ef          	haddpd %xmm7,%xmm5

408d8c:	66 45 0f 59 d8       	mulpd  %xmm8,%xmm11

408d91:	66 45 0f 59 c2       	mulpd  %xmm10,%xmm8

408d96:	66 41 0f 29 29       	movapd %xmm5,(%r9)

408d9b:	66 41 0f 58 e4       	addpd  %xmm12,%xmm4

408da0:	66 41 0f 58 f1       	addpd  %xmm9,%xmm6

408da5:	66 41 0f 58 d3       	addpd  %xmm11,%xmm2

408daa:	66 41 0f 58 c0       	addpd  %xmm8,%xmm0

408daf:	66 0f 7c e6          	haddpd %xmm6,%xmm4

408db3:	66 0f 7c c2          	haddpd %xmm2,%xmm0

408db7:	66 41 0f 29 61 10    	movapd %xmm4,0x10(%r9)

408dbd:	66 41 0f 29 0c f9    	movapd %xmm1,(%r9,%rdi,8)

408dc3:	66 41 0f 29 44 f9 10 	movapd %xmm0,0x10(%r9,%rdi,8)

408dca:	c3                   	retq

408dcb:	0f 1f 44 00 00       	nopl   0x0(%rax,%rax,1)

```

Here is the OpenCL equivalent code:

```#ifdef cl_khr_fp64

#pragma OPENCL EXTENSION cl_khr_fp64 : enable

#endif
const double2 FA[8] = {

(double2)(8.4334247333529341094733325815816e-7,

-0.1290557201342060969516786758559028e-4),

(double2)(0.8762984476210559564689161894116397e-4,

-0.30158038132690463167163703826169879e-3),

(double2)(0.174723713672993903449447812749852942e-2,

-0.942047030201080385922711540948195075e-2),

(double2)(0.2373821463724942397566389712597274535e-1,

0.612625895831207982195380597e-1),

(double2)(0.9940415697834003993178616713,

-0.604895289196983516002834636e-1),

(double2)(-0.2103025160930381434955489412839065067e-1,

0.1337263414854794752733423467013220997e-1),

(double2)(-0.344128144493493857280881509686821861e-2,

0.49443227688689919192282259476750972e-3),

(double2)(-0.5185986881173432922848639136911487e-4,

2.72734492911979659657715313017228e-6)};

const double2 FU[8] = {

(double2)(2.72734492911979659657715313017228e-6,

8.4334247333529341094733325815816e-7),

(double2)(-0.1290557201342060969516786758559028e-4,

0.8762984476210559564689161894116397e-4),

(double2)(-0.30158038132690463167163703826169879e-3,

0.174723713672993903449447812749852942e-2),

(double2)(-0.942047030201080385922711540948195075e-2,

0.2373821463724942397566389712597274535e-1),

(double2)(0.612625895831207982195380597e-1,

0.9940415697834003993178616713),

(double2)(-0.604895289196983516002834636e-1,

-0.2103025160930381434955489412839065067e-1),

(double2)(0.1337263414854794752733423467013220997e-1,

-0.344128144493493857280881509686821861e-2),

(double2)(0.49443227688689919192282259476750972e-3,

-0.5185986881173432922848639136911487e-4)};
#define conv_4x2_block_fused(o_f,o_s,d00,d10,d20,d30)

d00 = in0[o_s];

A = FA[o_f];

S00 += d00 * A;

U = FU[o_f];

d10 = in1[o_s];

S01 += d00 * U;

S11 += d10 * U;

d20 = in2[o_s];

S10 += d10 * A;

d30 = in3[o_s];

S20 += d20 * A;

S30 += d30 * A;

S31 += d30 * U;

S21 += d20 * U;
__kernel void magic_filter5(const ulong n, const ulong ndat, __global const double2 *in, __global double2 *out) {

const unsigned int n2 = n/2;

const unsigned int ndat2 = ndat/2;

__global const double2 *in0 = in;

__global const double2 *in1 = in0 + n2;

__global const double2 *in2 = in1 + n2;

__global const double2 *in3 = in2 + n2;

__global double2 *out = out + 4;

double2 D00,D10,D20,D30;

double2 S00,S01,S10,S11,S20,S21,S30,S31;

double2 A, U;
A = FA[0];

D00 = in0[0];

S00 = D00 * A;

D10 = in1[0];

S10 = D10 * A;

D20 = in2[0];

S20 = D20 * A;

D30 = in3[0];

S30 = D30 * A;
U = FU[0];

D00.x = in0[8].x;

S01 = D00 * U;

D10.x = in1[8].x;

S11 = D10 * U;

D20.x = in2[8].x;

S21 = D20 * U;

D30.x = in3[8].x;

S31 = D30 * U;
conv_4x2_block_fused(1,1,D00,D10,D20,D30);

conv_4x2_block_fused(2,2,D00,D10,D20,D30);

conv_4x2_block_fused(3,3,D00,D10,D20,D30);

conv_4x2_block_fused(4,4,D00,D10,D20,D30);

conv_4x2_block_fused(5,5,D00,D10,D20,D30);

conv_4x2_block_fused(6,6,D00,D10,D20,D30);

conv_4x2_block_fused(7,7,D00,D10,D20,D30);
out[0] = (double2)(S00.x + S00.y, S10.x + S10.y);

out[1] = (double2)(S20.x + S20.y, S30.x + S30.y);

out[ndat2 + 0] = (double2)(S01.x + S01.y, S11.x + S11.y);

out[ndat2 + 1] = (double2)(S21.x + S21.y, S31.x + S31.y);
}```

And here is the generated assembly:

```	.section	.rodata.cst16,"aM",@progbits,16

.align	16

.LCPI2_0:                               # constant pool <2 x double>

.quad	4546094365667641806     # double 8.762984e-05

.quad	-4669173237098585703    # double -3.015804e-04

.LCPI2_1:                               # constant pool <2 x double>

.quad	4516068371457184450     # double 8.433425e-07

.quad	-4689636306257622577    # double -1.290557e-05

.LCPI2_2:                               # constant pool <2 x double>

.quad	4565700531973276514     # double 1.747237e-03

.quad	-4646787883676373161    # double -9.420470e-03

.LCPI2_3:                               # constant pool <2 x double>

.quad	4582499295942488407     # double 2.373821e-02

.quad	4588989690655579855     # double 6.126259e-02

.LCPI2_4:                               # constant pool <2 x double>

.quad	4607128750031811026     # double 9.940416e-01

.quad	-4634493755982099125    # double -6.048953e-02

.LCPI2_5:                               # constant pool <2 x double>

.quad	-4641653258114437092    # double -2.103025e-02

.quad	4578862420522603841     # double 1.337263e-02

.LCPI2_6:                               # constant pool <2 x double>

.quad	-4653290559565923332    # double -3.441281e-03

.quad	4557699556108328520     # double 4.944323e-04

.LCPI2_7:                               # constant pool <2 x double>

.quad	-4680594046271356616    # double -5.185987e-05

.quad	4523550187392857396     # double 2.727345e-06

.LCPI2_8:                               # constant pool <2 x double>

.quad	-4689636306257622577    # double -1.290557e-05

.quad	4546094365667641806     # double 8.762984e-05

.LCPI2_9:                               # constant pool <2 x double>

.quad	4523550187392857396     # double 2.727345e-06

.quad	4516068371457184450     # double 8.433425e-07

.LCPI2_10:                              # constant pool <2 x double>

.quad	-4669173237098585703    # double -3.015804e-04

.quad	4565700531973276514     # double 1.747237e-03

.LCPI2_11:                              # constant pool <2 x double>

.quad	-4646787883676373161    # double -9.420470e-03

.quad	4582499295942488407     # double 2.373821e-02

.LCPI2_12:                              # constant pool <2 x double>

.quad	4588989690655579855     # double 6.126259e-02

.quad	4607128750031811026     # double 9.940416e-01

.LCPI2_13:                              # constant pool <2 x double>

.quad	-4634493755982099125    # double -6.048953e-02

.quad	-4641653258114437092    # double -2.103025e-02

.LCPI2_14:                              # constant pool <2 x double>

.quad	4578862420522603841     # double 1.337263e-02

.quad	-4653290559565923332    # double -3.441281e-03

.LCPI2_15:                              # constant pool <2 x double>

.quad	4557699556108328520     # double 4.944323e-04

.quad	-4680594046271356616    # double -5.185987e-05

.text

.globl	magic_filter5

.align	16, 0x90

.type	magic_filter5,@function

magic_filter5:                          # @magic_filter5

# BB#0:                                 # %FirstBB

push	RBP

push	R15

push	R14

push	R13

push	R12

push	RBX

sub	RSP, 344

mov	RAX, QWORD PTR [RSP + 400]

shr	RAX

mov	EAX, EAX

lea	RCX, QWORD PTR [RAX + 2*RAX]

mov	RDX, RAX

shl	RDX, 5

mov	RSI, QWORD PTR [RSP + 416]

lea	RDI, QWORD PTR [RDX + RSI + 128]

mov	QWORD PTR [RSP + 8], RDI # 8-byte Spill

lea	RDI, QWORD PTR [RSI + RDX + 16]

mov	QWORD PTR [RSP], RDI    # 8-byte Spill

lea	RDI, QWORD PTR [RDX + RSI + 32]

mov	QWORD PTR [RSP - 8], RDI # 8-byte Spill

lea	RDI, QWORD PTR [RDX + RSI + 48]

mov	QWORD PTR [RSP - 16], RDI # 8-byte Spill

lea	RDI, QWORD PTR [RDX + RSI + 64]

mov	QWORD PTR [RSP - 24], RDI # 8-byte Spill

lea	RDI, QWORD PTR [RDX + RSI + 80]

mov	QWORD PTR [RSP - 32], RDI # 8-byte Spill

lea	RDI, QWORD PTR [RDX + RSI + 96]

mov	QWORD PTR [RSP - 40], RDI # 8-byte Spill

lea	RDI, QWORD PTR [RDX + RSI + 112]

mov	QWORD PTR [RSP - 48], RDI # 8-byte Spill

shl	RAX, 4

lea	RDI, QWORD PTR [RSI + RAX + 128]

mov	QWORD PTR [RSP - 56], RDI # 8-byte Spill

lea	RDI, QWORD PTR [RSI + RAX + 16]

mov	QWORD PTR [RSP - 64], RDI # 8-byte Spill

lea	RDI, QWORD PTR [RSI + RAX + 32]

mov	QWORD PTR [RSP - 72], RDI # 8-byte Spill

lea	RDI, QWORD PTR [RSI + RAX + 48]

mov	QWORD PTR [RSP - 80], RDI # 8-byte Spill

lea	RDI, QWORD PTR [RSI + RAX + 64]

mov	QWORD PTR [RSP - 88], RDI # 8-byte Spill

lea	RDI, QWORD PTR [RSI + RAX + 80]

mov	QWORD PTR [RSP - 96], RDI # 8-byte Spill

lea	RDI, QWORD PTR [RSI + RAX + 96]

mov	QWORD PTR [RSP - 104], RDI # 8-byte Spill

lea	RDI, QWORD PTR [RSI + RAX + 112]

mov	QWORD PTR [RSP - 112], RDI # 8-byte Spill

shl	RCX, 4

lea	RDI, QWORD PTR [RSI + RCX + 128]

mov	QWORD PTR [RSP - 120], RDI # 8-byte Spill

lea	RDI, QWORD PTR [RSI + RCX + 16]

mov	QWORD PTR [RSP - 128], RDI # 8-byte Spill

lea	RDI, QWORD PTR [RSI + RCX + 32]

lea	R8, QWORD PTR [RSI + RCX + 48]

lea	R9, QWORD PTR [RSI + RCX + 64]

lea	R10, QWORD PTR [RSI + RCX + 80]

lea	R11, QWORD PTR [RSI + RCX + 96]

lea	RBX, QWORD PTR [RSI + RCX + 112]

mov	R14, QWORD PTR [RSP + 408]

movabs	R15, 8589934590

and	R15, R14

mov	R12, QWORD PTR [RSP + 424]

lea	R15, QWORD PTR [R12 + 8*R15 + 64]

shr	R14

inc	R14D

shl	R14, 4

lea	R14, QWORD PTR [R14 + R12 + 64]

mov	R13, -1

.align	16, 0x90

.LBB2_1:                                # %SyncBB

# =>This Inner Loop Header: Depth=1

mov	RBP, QWORD PTR [RSP - 64] # 8-byte Reload

movapd	XMM0, XMMWORD PTR [RBP]

movapd	XMMWORD PTR [RSP + 304], XMM0 # 16-byte Spill

movapd	XMM1, XMM0

mulpd	XMM1, XMMWORD PTR [RIP + .LCPI2_0]

movapd	XMM2, XMMWORD PTR [RAX]

movapd	XMM3, XMM2

mulpd	XMM3, XMMWORD PTR [RIP + .LCPI2_1]

mov	RBP, QWORD PTR [RSP - 72] # 8-byte Reload

movapd	XMM0, XMMWORD PTR [RBP]

movapd	XMMWORD PTR [RSP + 64], XMM0 # 16-byte Spill

movapd	XMM1, XMM0

mulpd	XMM1, XMMWORD PTR [RIP + .LCPI2_2]

mov	RBP, QWORD PTR [RSP - 80] # 8-byte Reload

movapd	XMM0, XMMWORD PTR [RBP]

movapd	XMMWORD PTR [RSP + 288], XMM0 # 16-byte Spill

movapd	XMM3, XMM0

mulpd	XMM3, XMMWORD PTR [RIP + .LCPI2_3]

mov	RBP, QWORD PTR [RSP - 88] # 8-byte Reload

movapd	XMM0, XMMWORD PTR [RBP]

movapd	XMMWORD PTR [RSP + 256], XMM0 # 16-byte Spill

movapd	XMM1, XMM0

mulpd	XMM1, XMMWORD PTR [RIP + .LCPI2_4]

mov	RBP, QWORD PTR [RSP - 96] # 8-byte Reload

movapd	XMM0, XMMWORD PTR [RBP]

movapd	XMMWORD PTR [RSP + 144], XMM0 # 16-byte Spill

movapd	XMM3, XMM0

mulpd	XMM3, XMMWORD PTR [RIP + .LCPI2_5]

mov	RBP, QWORD PTR [RSP - 104] # 8-byte Reload

movapd	XMM0, XMMWORD PTR [RBP]

movapd	XMMWORD PTR [RSP + 128], XMM0 # 16-byte Spill

movapd	XMM1, XMM0

mulpd	XMM1, XMMWORD PTR [RIP + .LCPI2_6]

mov	RBP, QWORD PTR [RSP - 112] # 8-byte Reload

movapd	XMM3, XMMWORD PTR [RBP]

movapd	XMM4, XMM3

mulpd	XMM4, XMMWORD PTR [RIP + .LCPI2_7]

movapd	XMM1, XMM4

unpckhpd	XMM1, XMM1      # xmm1 = xmm1[1,1]

movapd	XMM4, XMMWORD PTR [RSI]

movapd	XMM5, XMMWORD PTR [RSI + 16]

movapd	XMMWORD PTR [RSP + 320], XMM5 # 16-byte Spill

movapd	XMM0, XMMWORD PTR [RSI + 32]

movapd	XMMWORD PTR [RSP + 112], XMM0 # 16-byte Spill

movapd	XMM5, XMMWORD PTR [RSI + 48]

movapd	XMMWORD PTR [RSP + 272], XMM5 # 16-byte Spill

movapd	XMM6, XMMWORD PTR [RSP + 320] # 16-byte Reload

mulpd	XMM6, XMMWORD PTR [RIP + .LCPI2_0]

movapd	XMM7, XMM4

mulpd	XMM7, XMMWORD PTR [RIP + .LCPI2_1]

movapd	XMM6, XMM0

mulpd	XMM6, XMMWORD PTR [RIP + .LCPI2_2]

movapd	XMM7, XMM5

mulpd	XMM7, XMMWORD PTR [RIP + .LCPI2_3]

movapd	XMM0, XMMWORD PTR [RSI + 64]

movapd	XMMWORD PTR [RSP + 96], XMM0 # 16-byte Spill

movapd	XMM5, XMM0

mulpd	XMM5, XMMWORD PTR [RIP + .LCPI2_4]

movapd	XMM0, XMMWORD PTR [RSI + 80]

movapd	XMMWORD PTR [RSP + 80], XMM0 # 16-byte Spill

movapd	XMM6, XMM0

mulpd	XMM6, XMMWORD PTR [RIP + .LCPI2_5]

movapd	XMM0, XMMWORD PTR [RSI + 96]

movapd	XMMWORD PTR [RSP + 16], XMM0 # 16-byte Spill

movapd	XMM5, XMM0

mulpd	XMM5, XMMWORD PTR [RIP + .LCPI2_6]

movapd	XMM6, XMMWORD PTR [RSI + 112]

movapd	XMM7, XMM6

mulpd	XMM7, XMMWORD PTR [RIP + .LCPI2_7]

movapd	XMM5, XMM7

unpckhpd	XMM5, XMM5      # xmm5 = xmm5[1,1]

unpcklpd	XMM5, XMM1      # xmm5 = xmm5[0],xmm1[0]

movapd	XMM1, XMMWORD PTR [RDX]

movapd	XMM7, XMM1

mulpd	XMM7, XMMWORD PTR [RIP + .LCPI2_1]

mov	RBP, QWORD PTR [RSP + 8] # 8-byte Reload

movlpd	XMM1, QWORD PTR [RBP]

movapd	XMM8, XMMWORD PTR [RCX]

movapd	XMM9, XMM8

mulpd	XMM9, XMMWORD PTR [RIP + .LCPI2_1]

mov	RBP, QWORD PTR [RSP - 120] # 8-byte Reload

movlpd	XMM8, QWORD PTR [RBP]

movapd	XMM10, XMMWORD PTR [RSP + 320] # 16-byte Reload

mulpd	XMM10, XMMWORD PTR [RIP + .LCPI2_8]

movapd	XMMWORD PTR [RSP + 320], XMM10 # 16-byte Spill

movlpd	XMM4, QWORD PTR [RSI + 128]

mov	RBP, QWORD PTR [RSP - 56] # 8-byte Reload

movlpd	XMM2, QWORD PTR [RBP]

mov	RBP, QWORD PTR [RSP - 48] # 8-byte Reload

movapd	XMM10, XMMWORD PTR [RBP]

movapd	XMMWORD PTR [RSP + 48], XMM10 # 16-byte Spill

mov	RBP, QWORD PTR [RSP - 40] # 8-byte Reload

movapd	XMM11, XMMWORD PTR [RBP]

movapd	XMMWORD PTR [RSP + 240], XMM11 # 16-byte Spill

mov	RBP, QWORD PTR [RSP - 32] # 8-byte Reload

movapd	XMM12, XMMWORD PTR [RBP]

movapd	XMMWORD PTR [RSP + 224], XMM12 # 16-byte Spill

mov	RBP, QWORD PTR [RSP - 24] # 8-byte Reload

movapd	XMM13, XMMWORD PTR [RBP]

movapd	XMMWORD PTR [RSP + 208], XMM13 # 16-byte Spill

mov	RBP, QWORD PTR [RSP - 16] # 8-byte Reload

movapd	XMM14, XMMWORD PTR [RBP]

movapd	XMMWORD PTR [RSP + 192], XMM14 # 16-byte Spill

mov	RBP, QWORD PTR [RSP - 8] # 8-byte Reload

movapd	XMM15, XMMWORD PTR [RBP]

movapd	XMMWORD PTR [RSP + 160], XMM15 # 16-byte Spill

mov	RBP, QWORD PTR [RSP]    # 8-byte Reload

movapd	XMM0, XMMWORD PTR [RBP]

movapd	XMMWORD PTR [RSP + 176], XMM0 # 16-byte Spill

movapd	XMM0, XMMWORD PTR [RBX]

movapd	XMM10, XMMWORD PTR [R11]

movapd	XMM11, XMMWORD PTR [R10]

movapd	XMM12, XMMWORD PTR [R9]

movapd	XMM13, XMMWORD PTR [R8]

movapd	XMM14, XMMWORD PTR [RDI]

mov	RBP, QWORD PTR [RSP - 128] # 8-byte Reload

movapd	XMM15, XMMWORD PTR [RBP]

movapd	XMMWORD PTR [RSP + 32], XMM15 # 16-byte Spill

movapd	XMMWORD PTR [R12 + 64], XMM5

movapd	XMM5, XMM15

mulpd	XMM5, XMMWORD PTR [RIP + .LCPI2_0]

movapd	XMM9, XMM14

mulpd	XMM9, XMMWORD PTR [RIP + .LCPI2_2]

movapd	XMM5, XMM13

mulpd	XMM5, XMMWORD PTR [RIP + .LCPI2_3]

movapd	XMM9, XMM12

mulpd	XMM9, XMMWORD PTR [RIP + .LCPI2_4]

movapd	XMM5, XMM11

mulpd	XMM5, XMMWORD PTR [RIP + .LCPI2_5]

movapd	XMM9, XMM10

mulpd	XMM9, XMMWORD PTR [RIP + .LCPI2_6]

movapd	XMM5, XMM0

mulpd	XMM5, XMMWORD PTR [RIP + .LCPI2_7]

movapd	XMM9, XMM5

unpckhpd	XMM9, XMM9      # xmm9 = xmm9[1,1]

movapd	XMM5, XMMWORD PTR [RSP + 176] # 16-byte Reload

mulpd	XMM5, XMMWORD PTR [RIP + .LCPI2_0]

movapd	XMM7, XMMWORD PTR [RSP + 160] # 16-byte Reload

mulpd	XMM7, XMMWORD PTR [RIP + .LCPI2_2]

movapd	XMM5, XMMWORD PTR [RSP + 192] # 16-byte Reload

mulpd	XMM5, XMMWORD PTR [RIP + .LCPI2_3]

movapd	XMM7, XMMWORD PTR [RSP + 208] # 16-byte Reload

mulpd	XMM7, XMMWORD PTR [RIP + .LCPI2_4]

movapd	XMM5, XMMWORD PTR [RSP + 224] # 16-byte Reload

mulpd	XMM5, XMMWORD PTR [RIP + .LCPI2_5]

movapd	XMM7, XMMWORD PTR [RSP + 240] # 16-byte Reload

mulpd	XMM7, XMMWORD PTR [RIP + .LCPI2_6]

movapd	XMM5, XMMWORD PTR [RSP + 48] # 16-byte Reload

mulpd	XMM5, XMMWORD PTR [RIP + .LCPI2_7]

movapd	XMM7, XMM5

unpckhpd	XMM7, XMM7      # xmm7 = xmm7[1,1]

unpcklpd	XMM7, XMM9      # xmm7 = xmm7[0],xmm9[0]

movapd	XMMWORD PTR [R12 + 80], XMM7

movapd	XMM5, XMMWORD PTR [RSP + 304] # 16-byte Reload

mulpd	XMM5, XMMWORD PTR [RIP + .LCPI2_8]

movapd	XMMWORD PTR [RSP + 304], XMM5 # 16-byte Spill

movapd	XMM5, XMMWORD PTR [RIP + .LCPI2_9]

mulpd	XMM2, XMM5

addpd	XMM2, XMMWORD PTR [RSP + 304] # 16-byte Folded Reload

movapd	XMM7, XMMWORD PTR [RIP + .LCPI2_10]

movapd	XMM9, XMMWORD PTR [RSP + 64] # 16-byte Reload

mulpd	XMM9, XMM7

movapd	XMMWORD PTR [RSP + 64], XMM9 # 16-byte Spill

movapd	XMM2, XMMWORD PTR [RSP + 288] # 16-byte Reload

mulpd	XMM2, XMMWORD PTR [RIP + .LCPI2_11]

movapd	XMMWORD PTR [RSP + 288], XMM2 # 16-byte Spill

movapd	XMM2, XMMWORD PTR [RSP + 256] # 16-byte Reload

mulpd	XMM2, XMMWORD PTR [RIP + .LCPI2_12]

addpd	XMM2, XMMWORD PTR [RSP + 288] # 16-byte Folded Reload

movapd	XMMWORD PTR [RSP + 256], XMM2 # 16-byte Spill

movapd	XMM2, XMMWORD PTR [RSP + 144] # 16-byte Reload

mulpd	XMM2, XMMWORD PTR [RIP + .LCPI2_13]

addpd	XMM2, XMMWORD PTR [RSP + 256] # 16-byte Folded Reload

movapd	XMMWORD PTR [RSP + 144], XMM2 # 16-byte Spill

movapd	XMM2, XMMWORD PTR [RIP + .LCPI2_14]

movapd	XMM9, XMMWORD PTR [RSP + 128] # 16-byte Reload

mulpd	XMM9, XMM2

addpd	XMM9, XMMWORD PTR [RSP + 144] # 16-byte Folded Reload

movapd	XMMWORD PTR [RSP + 128], XMM9 # 16-byte Spill

movapd	XMM9, XMMWORD PTR [RIP + .LCPI2_15]

mulpd	XMM3, XMM9

addpd	XMM3, XMMWORD PTR [RSP + 128] # 16-byte Folded Reload

movapd	XMM15, XMM3

unpckhpd	XMM15, XMM15    # xmm15 = xmm15[1,1]

mulpd	XMM4, XMM5

addpd	XMM4, XMMWORD PTR [RSP + 320] # 16-byte Folded Reload

movapd	XMM3, XMMWORD PTR [RSP + 112] # 16-byte Reload

mulpd	XMM3, XMM7

movapd	XMMWORD PTR [RSP + 112], XMM3 # 16-byte Spill

movapd	XMM3, XMMWORD PTR [RSP + 272] # 16-byte Reload

mulpd	XMM3, XMMWORD PTR [RIP + .LCPI2_11]

addpd	XMM3, XMMWORD PTR [RSP + 112] # 16-byte Folded Reload

movapd	XMMWORD PTR [RSP + 272], XMM3 # 16-byte Spill

movapd	XMM3, XMMWORD PTR [RSP + 96] # 16-byte Reload

mulpd	XMM3, XMMWORD PTR [RIP + .LCPI2_12]

addpd	XMM3, XMMWORD PTR [RSP + 272] # 16-byte Folded Reload

movapd	XMMWORD PTR [RSP + 96], XMM3 # 16-byte Spill

movapd	XMM3, XMMWORD PTR [RSP + 80] # 16-byte Reload

mulpd	XMM3, XMMWORD PTR [RIP + .LCPI2_13]

addpd	XMM3, XMMWORD PTR [RSP + 96] # 16-byte Folded Reload

movapd	XMMWORD PTR [RSP + 80], XMM3 # 16-byte Spill

movapd	XMM3, XMMWORD PTR [RSP + 16] # 16-byte Reload

mulpd	XMM3, XMM2

addpd	XMM3, XMMWORD PTR [RSP + 80] # 16-byte Folded Reload

movapd	XMMWORD PTR [RSP + 16], XMM3 # 16-byte Spill

mulpd	XMM6, XMM9

movapd	XMM3, XMM6

unpckhpd	XMM3, XMM3      # xmm3 = xmm3[1,1]

unpcklpd	XMM3, XMM15     # xmm3 = xmm3[0],xmm15[0]

movapd	XMMWORD PTR [R15], XMM3

movapd	XMM15, XMMWORD PTR [RSP + 32] # 16-byte Reload

mulpd	XMM15, XMMWORD PTR [RIP + .LCPI2_8]

movapd	XMMWORD PTR [RSP + 32], XMM15 # 16-byte Spill

mulpd	XMM8, XMM5

mulpd	XMM14, XMM7

mulpd	XMM13, XMMWORD PTR [RIP + .LCPI2_11]

mulpd	XMM12, XMMWORD PTR [RIP + .LCPI2_12]

mulpd	XMM11, XMMWORD PTR [RIP + .LCPI2_13]

mulpd	XMM10, XMM2

mulpd	XMM0, XMM9

movapd	XMM3, XMM0

unpckhpd	XMM3, XMM3      # xmm3 = xmm3[1,1]

movapd	XMM0, XMMWORD PTR [RSP + 176] # 16-byte Reload

mulpd	XMM0, XMMWORD PTR [RIP + .LCPI2_8]

movapd	XMMWORD PTR [RSP + 176], XMM0 # 16-byte Spill

mulpd	XMM1, XMM5

movapd	XMM15, XMMWORD PTR [RSP + 160] # 16-byte Reload

mulpd	XMM15, XMM7

movapd	XMMWORD PTR [RSP + 160], XMM15 # 16-byte Spill

movapd	XMM14, XMMWORD PTR [RSP + 192] # 16-byte Reload

mulpd	XMM14, XMMWORD PTR [RIP + .LCPI2_11]

movapd	XMMWORD PTR [RSP + 192], XMM14 # 16-byte Spill

movapd	XMM13, XMMWORD PTR [RSP + 208] # 16-byte Reload

mulpd	XMM13, XMMWORD PTR [RIP + .LCPI2_12]

movapd	XMMWORD PTR [RSP + 208], XMM13 # 16-byte Spill

movapd	XMM12, XMMWORD PTR [RSP + 224] # 16-byte Reload

mulpd	XMM12, XMMWORD PTR [RIP + .LCPI2_13]

movapd	XMMWORD PTR [RSP + 224], XMM12 # 16-byte Spill

movapd	XMM11, XMMWORD PTR [RSP + 240] # 16-byte Reload

mulpd	XMM11, XMM2

movapd	XMMWORD PTR [RSP + 240], XMM11 # 16-byte Spill

movapd	XMM10, XMMWORD PTR [RSP + 48] # 16-byte Reload

mulpd	XMM10, XMM9

movapd	XMM0, XMM10

unpckhpd	XMM0, XMM0      # xmm0 = xmm0[1,1]

unpcklpd	XMM0, XMM3      # xmm0 = xmm0[0],xmm3[0]

movapd	XMMWORD PTR [R14], XMM0

inc	R13

cmp	R13, QWORD PTR [RSP + 480]

jb	.LBB2_1

# BB#2:                                 # %SyncBB29

pop	RBX

pop	R12

pop	R13

pop	R14

pop	R15

pop	RBP

ret

.Ltmp2:

.size	magic_filter5, .Ltmp2-magic_filter5```

As can be seen, the OpenCL version is spilling a lot of registers. This effect halves the performance.
Am I doing something wrong? I would have expected code written in intrinsics to port quite straightforwardly to OpenCL vectorized language. Even though I was curious to see if the compiler would optimize the transposition using haddpd, I did not expect such differences.

Sorry for the long post with lots of code but I could not reduce further.

3 posts / 0 new
For more complete information about compiler optimizations, see our Optimization Notice.

I am still struggling with this problem. Would there be a way in the compiler to bypass most of the LLVM optimizations and go straight to the code generation phase? This way the compiler would just be used to allocate registers, the way it happens when using intrinsics and a standard compiler.

One specific optimization you might want to disable first when dealing with explicitly vectorized code is OCL vectorizer itself.

Consider prefixing your kernel with vec_type_hint:

__kernel __attribute__((vec_type_hint(double2))) foo(...)

This is unlikely to be related to register allocation, though. If you are seeing too many spills/fills, it might indicate that you have (for example) too many constants in use. A native compiler (as opposed to OpenCL, which is just-in-time compiled) might handle this situation somewhat better.