Vectorization Problems

I have problems trying to translate code written with intrinsics to OpenCL vector code. I am working with double precision. The code is doing vectorized filtering of data using a filter of length 16.

The original code is (sorry but I could not reduce it further):

/* 16-tap "magic filter" coefficients.  16-byte aligned so each pair of
 * consecutive taps can be fetched with an aligned SSE2 load
 * (_mm_load_pd reads filter+0, +2, ..., +14 in conv_4x2_fused). */
const double filter[] __attribute__ ((aligned (16))) = {
                        8.4334247333529341094733325815816e-7,
                       -0.1290557201342060969516786758559028e-4,
                        0.8762984476210559564689161894116397e-4,
                       -0.30158038132690463167163703826169879e-3,
                        0.174723713672993903449447812749852942e-2,
                       -0.942047030201080385922711540948195075e-2,
                        0.2373821463724942397566389712597274535e-1,
                        0.612625895831207982195380597e-1,
                        0.9940415697834003993178616713,
                       -0.604895289196983516002834636e-1,
                       -0.2103025160930381434955489412839065067e-1,
                        0.1337263414854794752733423467013220997e-1,
                       -0.344128144493493857280881509686821861e-2,
                        0.49443227688689919192282259476750972e-3,
                       -0.5185986881173432922848639136911487e-4,
                        2.72734492911979659657715313017228e-6};
/* The same filter rotated right by one element (wrap-around tap first),
 * used for the second output column computed per row.  Also 16-byte
 * aligned for _mm_load_pd.
 * NOTE(review): this array has 17 entries — the trailing
 * 2.727...e-6 duplicates entry 0 and is never read by conv_4x2_fused
 * (loads stop at filter_u+14, i.e. indices 14..15); presumably padding,
 * but confirm it is intentional. */
const double filter_u[] __attribute__ ((aligned (16))) = {
                        2.72734492911979659657715313017228e-6,
                        8.4334247333529341094733325815816e-7,
                       -0.1290557201342060969516786758559028e-4,
                        0.8762984476210559564689161894116397e-4,
                       -0.30158038132690463167163703826169879e-3,
                        0.174723713672993903449447812749852942e-2,
                       -0.942047030201080385922711540948195075e-2,
                        0.2373821463724942397566389712597274535e-1,
                        0.612625895831207982195380597e-1,
                        0.9940415697834003993178616713,
                       -0.604895289196983516002834636e-1,
                       -0.2103025160930381434955489412839065067e-1,
                        0.1337263414854794752733423467013220997e-1,
                       -0.344128144493493857280881509686821861e-2,
                        0.49443227688689919192282259476750972e-3,
                       -0.5185986881173432922848639136911487e-4,
                        2.72734492911979659657715313017228e-6};
/* One multiply-accumulate step of the fused 4x2 convolution:
 * load two consecutive doubles from each of the four source rows
 * (d00..d30) and two filter taps from both tables (FA = filter,
 * FU = filter_u), then accumulate d*FA into the S*0 sums and d*FU into
 * the S*1 sums.  Loads and arithmetic are deliberately interleaved to
 * give the compiler scheduling freedom.  Uses FA, FU, S00..S31 and
 * source0..source3 from the enclosing function scope. */
#define conv_4x2_block_fused(offset_filter,offset_source,d00,d10,d20,d30) \
FA = _mm_load_pd(filter+offset_filter);\
d00 = _mm_load_pd(source0+offset_source);\
S00 = _mm_add_pd(S00,_mm_mul_pd(d00,FA));\
FU = _mm_load_pd(filter_u+offset_filter);\
d10 = _mm_load_pd(source1+offset_source);\
S01 = _mm_add_pd(S01,_mm_mul_pd(d00,FU));\
S11 = _mm_add_pd(S11,_mm_mul_pd(d10,FU));\
d20 = _mm_load_pd(source2+offset_source);\
S10 = _mm_add_pd(S10,_mm_mul_pd(d10,FA));\
d30 = _mm_load_pd(source3+offset_source);\
S20 = _mm_add_pd(S20,_mm_mul_pd(d20,FA));\
S30 = _mm_add_pd(S30,_mm_mul_pd(d30,FA));\
S31 = _mm_add_pd(S31,_mm_mul_pd(d30,FU));\
S21 = _mm_add_pd(S21,_mm_mul_pd(d20,FU));
/* Apply the 16-tap filter to four input rows at once, producing two
 * filtered values per row ("4 rows x 2 columns").
 *
 * source0..source3: four input rows; each row must provide at least 17
 *                   readable doubles (indices 0..16 are accessed).
 * dest:             outputs.  dest[0..1] = rows 0..1, dest[2..3] =
 *                   rows 2..3 (first column); the second column goes to
 *                   dest[ndat..ndat+3], so ndat is the output row stride
 *                   in doubles.
 *
 * Each __m128d accumulator Srj holds two partial sums; the final
 * _mm_hadd_pd collapses them and packs two rows per store. */
void conv_4x2_fused(size_t ndat, double const * source0, double const * source1,
                                 double const * source2, double const * source3,
                                 double       * dest){
  /* Sr0 accumulates row r against filter; Sr1 against filter_u. */
  __m128d S00,S01,S10,S11,S20,S21,S30,S31;
  __m128d FA,FU;
  __m128d D00,D10,D20,D30;
  /* Prime the first-column accumulators with taps 0..1. */
  FA = _mm_load_pd(filter);
  D00 = _mm_load_pd(source0);
  S00 = _mm_mul_pd(D00,FA);
  D10 = _mm_load_pd(source1);
  S10 = _mm_mul_pd(D10,FA);
  D20 = _mm_load_pd(source2);
  S20 = _mm_mul_pd(D20,FA);
  D30 = _mm_load_pd(source3);
  S30 = _mm_mul_pd(D30,FA);
  /* Prime the second-column accumulators: _mm_loadl_pd replaces only the
   * low double with the wrap-around element source[16], keeping
   * source[1] in the high half, which matches FU = {filter_u[0],
   * filter_u[1]} = {last tap, first tap}. */
  FU = _mm_load_pd(filter_u);
  S01 = _mm_loadl_pd(D00,source0+16);
  S01 = _mm_mul_pd(S01,FU);
  S11 = _mm_loadl_pd(D10,source1+16);
  S11 = _mm_mul_pd(S11,FU);
  S21 = _mm_loadl_pd(D20,source2+16);
  S21 = _mm_mul_pd(S21,FU);
  S31 = _mm_loadl_pd(D30,source3+16);
  S31 = _mm_mul_pd(S31,FU);
  /* Remaining taps 2..15, two at a time. */
  conv_4x2_block_fused(2,2,D00,D10,D20,D30);
  conv_4x2_block_fused(4,4,D00,D10,D20,D30);
  conv_4x2_block_fused(6,6,D00,D10,D20,D30);
  conv_4x2_block_fused(8,8,D00,D10,D20,D30);
  conv_4x2_block_fused(10,10,D00,D10,D20,D30);
  conv_4x2_block_fused(12,12,D00,D10,D20,D30);
  conv_4x2_block_fused(14,14,D00,D10,D20,D30);
  /* Horizontal sums: each hadd packs two rows' results into one store. */
  _mm_store_pd(dest,_mm_hadd_pd(S00,S10));
  _mm_store_pd(dest+2,_mm_hadd_pd(S20,S30));
  _mm_store_pd(dest+ndat,_mm_hadd_pd(S01,S11));
  _mm_store_pd(dest+2+ndat,_mm_hadd_pd(S21,S31));
}


Here is the generated assembly (gcc-4.6, but icc generates similar code):
0000000000408970 :

  408970:	66 0f 28 0e          	movapd (%rsi),%xmm1

  408974:	66 0f 28 1a          	movapd (%rdx),%xmm3

  408978:	66 0f 28 01          	movapd (%rcx),%xmm0

  40897c:	66 0f 28 e9          	movapd %xmm1,%xmm5

  408980:	66 0f 12 8e 80 00 00 	movlpd 0x80(%rsi),%xmm1

  408987:	00

  408988:	66 0f 28 35 80 ee 00 	movapd 0xee80(%rip),%xmm6     # 417810

  40898f:	00

  408990:	66 0f 28 fb          	movapd %xmm3,%xmm7

  408994:	66 0f 12 9a 80 00 00 	movlpd 0x80(%rdx),%xmm3

  40899b:	00

  40899c:	66 41 0f 28 10       	movapd (%r8),%xmm2

  4089a1:	66 0f 28 e0          	movapd %xmm0,%xmm4

  4089a5:	66 0f 12 81 80 00 00 	movlpd 0x80(%rcx),%xmm0

  4089ac:	00

  4089ad:	66 44 0f 28 05 ca ed 	movapd 0xedca(%rip),%xmm8     # 417780

  4089b4:	00 00

  4089b6:	66 0f 59 ee          	mulpd  %xmm6,%xmm5

  4089ba:	66 44 0f 28 56 10    	movapd 0x10(%rsi),%xmm10

  4089c0:	66 0f 59 fe          	mulpd  %xmm6,%xmm7

  4089c4:	66 44 0f 28 0d 53 ee 	movapd 0xee53(%rip),%xmm9     # 417820

  4089cb:	00 00

  4089cd:	66 0f 59 e6          	mulpd  %xmm6,%xmm4

  4089d1:	66 0f 59 f2          	mulpd  %xmm2,%xmm6

  4089d5:	66 44 0f 28 5a 10    	movapd 0x10(%rdx),%xmm11

  4089db:	66 41 0f 12 90 80 00 	movlpd 0x80(%r8),%xmm2

  4089e2:	00 00

  4089e4:	66 41 0f 59 c8       	mulpd  %xmm8,%xmm1

  4089e9:	66 41 0f 59 d8       	mulpd  %xmm8,%xmm3

  4089ee:	66 41 0f 59 c0       	mulpd  %xmm8,%xmm0

  4089f3:	66 41 0f 59 d0       	mulpd  %xmm8,%xmm2

  4089f8:	66 45 0f 28 c2       	movapd %xmm10,%xmm8

  4089fd:	66 45 0f 59 c1       	mulpd  %xmm9,%xmm8

  408a02:	66 41 0f 58 e8       	addpd  %xmm8,%xmm5

  408a07:	66 44 0f 28 05 80 ed 	movapd 0xed80(%rip),%xmm8     # 417790

  408a0e:	00 00

  408a10:	66 45 0f 59 d0       	mulpd  %xmm8,%xmm10

  408a15:	66 41 0f 58 ca       	addpd  %xmm10,%xmm1

  408a1a:	66 45 0f 28 d3       	movapd %xmm11,%xmm10

  408a1f:	66 45 0f 59 d0       	mulpd  %xmm8,%xmm10

  408a24:	66 45 0f 59 d9       	mulpd  %xmm9,%xmm11

  408a29:	66 41 0f 58 da       	addpd  %xmm10,%xmm3

  408a2e:	66 44 0f 28 51 10    	movapd 0x10(%rcx),%xmm10

  408a34:	66 41 0f 58 fb       	addpd  %xmm11,%xmm7

  408a39:	66 45 0f 28 58 10    	movapd 0x10(%r8),%xmm11

  408a3f:	66 45 0f 28 e2       	movapd %xmm10,%xmm12

  408a44:	66 45 0f 59 e1       	mulpd  %xmm9,%xmm12

  408a49:	66 45 0f 59 cb       	mulpd  %xmm11,%xmm9

  408a4e:	66 45 0f 59 d8       	mulpd  %xmm8,%xmm11

  408a53:	66 45 0f 59 c2       	mulpd  %xmm10,%xmm8

  408a58:	66 44 0f 28 56 20    	movapd 0x20(%rsi),%xmm10

  408a5e:	66 41 0f 58 e4       	addpd  %xmm12,%xmm4

  408a63:	66 41 0f 58 f1       	addpd  %xmm9,%xmm6

  408a68:	66 44 0f 28 0d bf ed 	movapd 0xedbf(%rip),%xmm9     # 417830

  408a6f:	00 00

  408a71:	66 41 0f 58 d3       	addpd  %xmm11,%xmm2

  408a76:	66 44 0f 28 5a 20    	movapd 0x20(%rdx),%xmm11

  408a7c:	66 41 0f 58 c0       	addpd  %xmm8,%xmm0

  408a81:	66 45 0f 28 c2       	movapd %xmm10,%xmm8

  408a86:	66 45 0f 59 c1       	mulpd  %xmm9,%xmm8

  408a8b:	66 41 0f 58 e8       	addpd  %xmm8,%xmm5

  408a90:	66 44 0f 28 05 07 ed 	movapd 0xed07(%rip),%xmm8     # 4177a0

  408a97:	00 00

  408a99:	66 45 0f 59 d0       	mulpd  %xmm8,%xmm10

  408a9e:	66 41 0f 58 ca       	addpd  %xmm10,%xmm1

  408aa3:	66 45 0f 28 d3       	movapd %xmm11,%xmm10

  408aa8:	66 45 0f 59 d0       	mulpd  %xmm8,%xmm10

  408aad:	66 45 0f 59 d9       	mulpd  %xmm9,%xmm11

  408ab2:	66 41 0f 58 da       	addpd  %xmm10,%xmm3

  408ab7:	66 44 0f 28 51 20    	movapd 0x20(%rcx),%xmm10

  408abd:	66 41 0f 58 fb       	addpd  %xmm11,%xmm7

  408ac2:	66 45 0f 28 58 20    	movapd 0x20(%r8),%xmm11

  408ac8:	66 45 0f 28 e2       	movapd %xmm10,%xmm12

  408acd:	66 45 0f 59 e1       	mulpd  %xmm9,%xmm12

  408ad2:	66 45 0f 59 cb       	mulpd  %xmm11,%xmm9

  408ad7:	66 45 0f 59 d8       	mulpd  %xmm8,%xmm11

  408adc:	66 45 0f 59 c2       	mulpd  %xmm10,%xmm8

  408ae1:	66 44 0f 28 56 30    	movapd 0x30(%rsi),%xmm10

  408ae7:	66 41 0f 58 e4       	addpd  %xmm12,%xmm4

  408aec:	66 41 0f 58 f1       	addpd  %xmm9,%xmm6

  408af1:	66 44 0f 28 0d 46 ed 	movapd 0xed46(%rip),%xmm9     # 417840

  408af8:	00 00

  408afa:	66 41 0f 58 d3       	addpd  %xmm11,%xmm2

  408aff:	66 44 0f 28 5a 30    	movapd 0x30(%rdx),%xmm11

  408b05:	66 41 0f 58 c0       	addpd  %xmm8,%xmm0

  408b0a:	66 45 0f 28 c2       	movapd %xmm10,%xmm8

  408b0f:	66 45 0f 59 c1       	mulpd  %xmm9,%xmm8

  408b14:	66 41 0f 58 e8       	addpd  %xmm8,%xmm5

  408b19:	66 44 0f 28 05 8e ec 	movapd 0xec8e(%rip),%xmm8     # 4177b0

  408b20:	00 00

  408b22:	66 45 0f 59 d0       	mulpd  %xmm8,%xmm10

  408b27:	66 41 0f 58 ca       	addpd  %xmm10,%xmm1

  408b2c:	66 45 0f 28 d3       	movapd %xmm11,%xmm10

  408b31:	66 45 0f 59 d0       	mulpd  %xmm8,%xmm10

  408b36:	66 45 0f 59 d9       	mulpd  %xmm9,%xmm11

  408b3b:	66 41 0f 58 da       	addpd  %xmm10,%xmm3

  408b40:	66 44 0f 28 51 30    	movapd 0x30(%rcx),%xmm10

  408b46:	66 41 0f 58 fb       	addpd  %xmm11,%xmm7

  408b4b:	66 45 0f 28 58 30    	movapd 0x30(%r8),%xmm11

  408b51:	66 45 0f 28 e2       	movapd %xmm10,%xmm12

  408b56:	66 45 0f 59 e1       	mulpd  %xmm9,%xmm12

  408b5b:	66 45 0f 59 cb       	mulpd  %xmm11,%xmm9

  408b60:	66 45 0f 59 d8       	mulpd  %xmm8,%xmm11

  408b65:	66 45 0f 59 c2       	mulpd  %xmm10,%xmm8

  408b6a:	66 44 0f 28 56 40    	movapd 0x40(%rsi),%xmm10

  408b70:	66 41 0f 58 e4       	addpd  %xmm12,%xmm4

  408b75:	66 41 0f 58 f1       	addpd  %xmm9,%xmm6

  408b7a:	66 44 0f 28 0d cd ec 	movapd 0xeccd(%rip),%xmm9     # 417850

  408b81:	00 00

  408b83:	66 41 0f 58 d3       	addpd  %xmm11,%xmm2

  408b88:	66 44 0f 28 5a 40    	movapd 0x40(%rdx),%xmm11

  408b8e:	66 41 0f 58 c0       	addpd  %xmm8,%xmm0

  408b93:	66 45 0f 28 c2       	movapd %xmm10,%xmm8

  408b98:	66 45 0f 59 c1       	mulpd  %xmm9,%xmm8

  408b9d:	66 41 0f 58 e8       	addpd  %xmm8,%xmm5

  408ba2:	66 44 0f 28 05 15 ec 	movapd 0xec15(%rip),%xmm8     # 4177c0

  408ba9:	00 00

  408bab:	66 45 0f 59 d0       	mulpd  %xmm8,%xmm10

  408bb0:	66 41 0f 58 ca       	addpd  %xmm10,%xmm1

  408bb5:	66 45 0f 28 d3       	movapd %xmm11,%xmm10

  408bba:	66 45 0f 59 d0       	mulpd  %xmm8,%xmm10

  408bbf:	66 45 0f 59 d9       	mulpd  %xmm9,%xmm11

  408bc4:	66 41 0f 58 da       	addpd  %xmm10,%xmm3

  408bc9:	66 44 0f 28 51 40    	movapd 0x40(%rcx),%xmm10

  408bcf:	66 41 0f 58 fb       	addpd  %xmm11,%xmm7

  408bd4:	66 45 0f 28 58 40    	movapd 0x40(%r8),%xmm11

  408bda:	66 45 0f 28 e2       	movapd %xmm10,%xmm12

  408bdf:	66 45 0f 59 e1       	mulpd  %xmm9,%xmm12

  408be4:	66 45 0f 59 cb       	mulpd  %xmm11,%xmm9

  408be9:	66 45 0f 59 d8       	mulpd  %xmm8,%xmm11

  408bee:	66 45 0f 59 c2       	mulpd  %xmm10,%xmm8

  408bf3:	66 44 0f 28 56 50    	movapd 0x50(%rsi),%xmm10

  408bf9:	66 41 0f 58 e4       	addpd  %xmm12,%xmm4

  408bfe:	66 41 0f 58 f1       	addpd  %xmm9,%xmm6

  408c03:	66 44 0f 28 0d 54 ec 	movapd 0xec54(%rip),%xmm9     # 417860

  408c0a:	00 00

  408c0c:	66 41 0f 58 d3       	addpd  %xmm11,%xmm2

  408c11:	66 44 0f 28 5a 50    	movapd 0x50(%rdx),%xmm11

  408c17:	66 41 0f 58 c0       	addpd  %xmm8,%xmm0

  408c1c:	66 45 0f 28 c2       	movapd %xmm10,%xmm8

  408c21:	66 45 0f 59 c1       	mulpd  %xmm9,%xmm8

  408c26:	66 41 0f 58 e8       	addpd  %xmm8,%xmm5

  408c2b:	66 44 0f 28 05 9c eb 	movapd 0xeb9c(%rip),%xmm8     # 4177d0

  408c32:	00 00

  408c34:	66 45 0f 59 d0       	mulpd  %xmm8,%xmm10

  408c39:	66 41 0f 58 ca       	addpd  %xmm10,%xmm1

  408c3e:	66 45 0f 28 d3       	movapd %xmm11,%xmm10

  408c43:	66 45 0f 59 d0       	mulpd  %xmm8,%xmm10

  408c48:	66 45 0f 59 d9       	mulpd  %xmm9,%xmm11

  408c4d:	66 41 0f 58 da       	addpd  %xmm10,%xmm3

  408c52:	66 44 0f 28 51 50    	movapd 0x50(%rcx),%xmm10

  408c58:	66 41 0f 58 fb       	addpd  %xmm11,%xmm7

  408c5d:	66 45 0f 28 58 50    	movapd 0x50(%r8),%xmm11

  408c63:	66 45 0f 28 e2       	movapd %xmm10,%xmm12

  408c68:	66 45 0f 59 e1       	mulpd  %xmm9,%xmm12

  408c6d:	66 45 0f 59 cb       	mulpd  %xmm11,%xmm9

  408c72:	66 45 0f 59 d8       	mulpd  %xmm8,%xmm11

  408c77:	66 45 0f 59 c2       	mulpd  %xmm10,%xmm8

  408c7c:	66 44 0f 28 56 60    	movapd 0x60(%rsi),%xmm10

  408c82:	66 41 0f 58 e4       	addpd  %xmm12,%xmm4

  408c87:	66 41 0f 58 f1       	addpd  %xmm9,%xmm6

  408c8c:	66 44 0f 28 0d db eb 	movapd 0xebdb(%rip),%xmm9     # 417870

  408c93:	00 00

  408c95:	66 41 0f 58 d3       	addpd  %xmm11,%xmm2

  408c9a:	66 44 0f 28 5a 60    	movapd 0x60(%rdx),%xmm11

  408ca0:	66 41 0f 58 c0       	addpd  %xmm8,%xmm0

  408ca5:	66 45 0f 28 c2       	movapd %xmm10,%xmm8

  408caa:	66 45 0f 59 c1       	mulpd  %xmm9,%xmm8

  408caf:	66 41 0f 58 e8       	addpd  %xmm8,%xmm5

  408cb4:	66 44 0f 28 05 23 eb 	movapd 0xeb23(%rip),%xmm8     # 4177e0

  408cbb:	00 00

  408cbd:	66 45 0f 59 d0       	mulpd  %xmm8,%xmm10

  408cc2:	66 41 0f 58 ca       	addpd  %xmm10,%xmm1

  408cc7:	66 45 0f 28 d3       	movapd %xmm11,%xmm10

  408ccc:	66 45 0f 59 d0       	mulpd  %xmm8,%xmm10

  408cd1:	66 45 0f 59 d9       	mulpd  %xmm9,%xmm11

  408cd6:	66 41 0f 58 da       	addpd  %xmm10,%xmm3

  408cdb:	66 44 0f 28 51 60    	movapd 0x60(%rcx),%xmm10

  408ce1:	66 41 0f 58 fb       	addpd  %xmm11,%xmm7

  408ce6:	66 45 0f 28 58 60    	movapd 0x60(%r8),%xmm11

  408cec:	66 45 0f 28 e2       	movapd %xmm10,%xmm12

  408cf1:	66 45 0f 59 e1       	mulpd  %xmm9,%xmm12

  408cf6:	66 45 0f 59 cb       	mulpd  %xmm11,%xmm9

  408cfb:	66 45 0f 59 d8       	mulpd  %xmm8,%xmm11

  408d00:	66 45 0f 59 c2       	mulpd  %xmm10,%xmm8

  408d05:	66 44 0f 28 56 70    	movapd 0x70(%rsi),%xmm10

  408d0b:	66 41 0f 58 e4       	addpd  %xmm12,%xmm4

  408d10:	66 41 0f 58 f1       	addpd  %xmm9,%xmm6

  408d15:	66 44 0f 28 0d 62 eb 	movapd 0xeb62(%rip),%xmm9     # 417880

  408d1c:	00 00

  408d1e:	66 41 0f 58 d3       	addpd  %xmm11,%xmm2

  408d23:	66 44 0f 28 5a 70    	movapd 0x70(%rdx),%xmm11

  408d29:	66 41 0f 58 c0       	addpd  %xmm8,%xmm0

  408d2e:	66 45 0f 28 c2       	movapd %xmm10,%xmm8

  408d33:	66 45 0f 59 c1       	mulpd  %xmm9,%xmm8

  408d38:	66 41 0f 58 e8       	addpd  %xmm8,%xmm5

  408d3d:	66 44 0f 28 05 aa ea 	movapd 0xeaaa(%rip),%xmm8     # 4177f0

  408d44:	00 00

  408d46:	66 45 0f 59 d0       	mulpd  %xmm8,%xmm10

  408d4b:	66 41 0f 58 ca       	addpd  %xmm10,%xmm1

  408d50:	66 45 0f 28 d3       	movapd %xmm11,%xmm10

  408d55:	66 45 0f 59 d0       	mulpd  %xmm8,%xmm10

  408d5a:	66 45 0f 59 d9       	mulpd  %xmm9,%xmm11

  408d5f:	66 41 0f 58 da       	addpd  %xmm10,%xmm3

  408d64:	66 44 0f 28 51 70    	movapd 0x70(%rcx),%xmm10

  408d6a:	66 41 0f 58 fb       	addpd  %xmm11,%xmm7

  408d6f:	66 45 0f 28 58 70    	movapd 0x70(%r8),%xmm11

  408d75:	66 45 0f 28 e2       	movapd %xmm10,%xmm12

  408d7a:	66 45 0f 59 e1       	mulpd  %xmm9,%xmm12

  408d7f:	66 0f 7c cb          	haddpd %xmm3,%xmm1

  408d83:	66 45 0f 59 cb       	mulpd  %xmm11,%xmm9

  408d88:	66 0f 7c ef          	haddpd %xmm7,%xmm5

  408d8c:	66 45 0f 59 d8       	mulpd  %xmm8,%xmm11

  408d91:	66 45 0f 59 c2       	mulpd  %xmm10,%xmm8

  408d96:	66 41 0f 29 29       	movapd %xmm5,(%r9)

  408d9b:	66 41 0f 58 e4       	addpd  %xmm12,%xmm4

  408da0:	66 41 0f 58 f1       	addpd  %xmm9,%xmm6

  408da5:	66 41 0f 58 d3       	addpd  %xmm11,%xmm2

  408daa:	66 41 0f 58 c0       	addpd  %xmm8,%xmm0

  408daf:	66 0f 7c e6          	haddpd %xmm6,%xmm4

  408db3:	66 0f 7c c2          	haddpd %xmm2,%xmm0

  408db7:	66 41 0f 29 61 10    	movapd %xmm4,0x10(%r9)

  408dbd:	66 41 0f 29 0c f9    	movapd %xmm1,(%r9,%rdi,8)

  408dc3:	66 41 0f 29 44 f9 10 	movapd %xmm0,0x10(%r9,%rdi,8)

  408dca:	c3                   	retq

  408dcb:	0f 1f 44 00 00       	nopl   0x0(%rax,%rax,1)


Here is the OpenCL equivalent code:

/* Enable the double-precision extension when the device advertises it. */
#ifdef cl_khr_fp64
#pragma OPENCL EXTENSION cl_khr_fp64 : enable
#endif
/* The 16 filter taps packed two per double2 (taps 0-1, 2-3, ..., 14-15):
 * OpenCL counterpart of the SSE filter[] table. */
const double2 FA[8] = {
(double2)(8.4334247333529341094733325815816e-7,
-0.1290557201342060969516786758559028e-4),
(double2)(0.8762984476210559564689161894116397e-4,
-0.30158038132690463167163703826169879e-3),
(double2)(0.174723713672993903449447812749852942e-2,
-0.942047030201080385922711540948195075e-2),
(double2)(0.2373821463724942397566389712597274535e-1,
0.612625895831207982195380597e-1),
(double2)(0.9940415697834003993178616713,
-0.604895289196983516002834636e-1),
(double2)(-0.2103025160930381434955489412839065067e-1,
0.1337263414854794752733423467013220997e-1),
(double2)(-0.344128144493493857280881509686821861e-2,
0.49443227688689919192282259476750972e-3),
(double2)(-0.5185986881173432922848639136911487e-4,
2.72734492911979659657715313017228e-6)};

/* The filter rotated right by one element (wrap-around tap first),
 * packed two per double2: OpenCL counterpart of the SSE filter_u[]
 * table, used for the second output column. */
const double2 FU[8] = {
(double2)(2.72734492911979659657715313017228e-6,
8.4334247333529341094733325815816e-7),
(double2)(-0.1290557201342060969516786758559028e-4,
0.8762984476210559564689161894116397e-4),
(double2)(-0.30158038132690463167163703826169879e-3,
0.174723713672993903449447812749852942e-2),
(double2)(-0.942047030201080385922711540948195075e-2,
0.2373821463724942397566389712597274535e-1),
(double2)(0.612625895831207982195380597e-1,
0.9940415697834003993178616713),
(double2)(-0.604895289196983516002834636e-1,
-0.2103025160930381434955489412839065067e-1),
(double2)(0.1337263414854794752733423467013220997e-1,
-0.344128144493493857280881509686821861e-2),
(double2)(0.49443227688689919192282259476750972e-3,
-0.5185986881173432922848639136911487e-4)};
/* One multiply-accumulate step of the fused 4x2 convolution, OpenCL
 * counterpart of the SSE macro: read one double2 from each of the four
 * rows (in0..in3) and one tap pair from both tables (FA, FU), then
 * accumulate d*A into the S*0 sums and d*U into the S*1 sums.
 * Uses A, U, S00..S31 and in0..in3 from the enclosing kernel scope.
 * FIX: restored the line-continuation backslashes — without them the
 * #define expands to nothing and the statements below it would sit
 * illegally at file scope. */
#define conv_4x2_block_fused(o_f,o_s,d00,d10,d20,d30) \
d00 = in0[o_s]; \
A = FA[o_f]; \
S00 += d00 * A; \
U = FU[o_f]; \
d10 = in1[o_s]; \
S01 += d00 * U; \
S11 += d10 * U; \
d20 = in2[o_s]; \
S10 += d10 * A; \
d30 = in3[o_s]; \
S20 += d20 * A; \
S30 += d30 * A; \
S31 += d30 * U; \
S21 += d20 * U;
/* Fused 4-row x 2-column magic-filter convolution, OpenCL port of the
 * SSE conv_4x2_fused.
 *
 * n:    input row length in doubles (must be even: rows are addressed
 *       as double2).
 * ndat: output row stride in doubles (must be even).
 * in:   four consecutive rows of n doubles each.
 * out:  results are written starting at out + 4 (double2 units):
 *       two double2 for the first output column of the four rows, and
 *       two more at stride ndat2 for the second column. */
__kernel void magic_filter5(const ulong n, const ulong ndat, __global const double2 *in, __global double2 *out) {
  const unsigned int n2 = n/2;
  const unsigned int ndat2 = ndat/2;
  __global const double2 *in0 = in;
  __global const double2 *in1 = in0 + n2;
  __global const double2 *in2 = in1 + n2;
  __global const double2 *in3 = in2 + n2;
  /* FIX: the original wrote "__global double2 *out = out + 4;", which
   * redeclares the parameter `out` in the same scope (a constraint
   * violation) and initializes the new variable from itself.  Use a
   * distinctly named pointer for the offset output position. */
  __global double2 *dst = out + 4;
  double2 D00,D10,D20,D30;
  /* Sr0 accumulates row r against FA; Sr1 against FU. */
  double2 S00,S01,S10,S11,S20,S21,S30,S31;
  double2 A, U;
  /* Prime the first-column accumulators with taps 0-1. */
  A = FA[0];
  D00 = in0[0];
  S00 = D00 * A;
  D10 = in1[0];
  S10 = D10 * A;
  D20 = in2[0];
  S20 = D20 * A;
  D30 = in3[0];
  S30 = D30 * A;
  /* Prime the second-column accumulators: overwrite only the low double
   * with the wrap-around element (double index 16 = in[8].x), matching
   * the SSE _mm_loadl_pd trick; FU[0] = {last tap, first tap}. */
  U = FU[0];
  D00.x = in0[8].x;
  S01 = D00 * U;
  D10.x = in1[8].x;
  S11 = D10 * U;
  D20.x = in2[8].x;
  S21 = D20 * U;
  D30.x = in3[8].x;
  S31 = D30 * U;
  /* Remaining taps 2..15, one double2 pair per step. */
  conv_4x2_block_fused(1,1,D00,D10,D20,D30);
  conv_4x2_block_fused(2,2,D00,D10,D20,D30);
  conv_4x2_block_fused(3,3,D00,D10,D20,D30);
  conv_4x2_block_fused(4,4,D00,D10,D20,D30);
  conv_4x2_block_fused(5,5,D00,D10,D20,D30);
  conv_4x2_block_fused(6,6,D00,D10,D20,D30);
  conv_4x2_block_fused(7,7,D00,D10,D20,D30);
  /* Horizontal sums: each accumulator holds two partial sums; pack two
   * rows per store (equivalent of the SSE haddpd + movapd). */
  dst[0] = (double2)(S00.x + S00.y, S10.x + S10.y);
  dst[1] = (double2)(S20.x + S20.y, S30.x + S30.y);
  dst[ndat2 + 0] = (double2)(S01.x + S01.y, S11.x + S11.y);
  dst[ndat2 + 1] = (double2)(S21.x + S21.y, S31.x + S31.y);
}

And here is the generated assembly:

	.section	.rodata.cst16,"aM",@progbits,16

	.align	16

.LCPI2_0:                               # constant pool <2 x double>

	.quad	4546094365667641806     # double 8.762984e-05

	.quad	-4669173237098585703    # double -3.015804e-04

.LCPI2_1:                               # constant pool <2 x double>

	.quad	4516068371457184450     # double 8.433425e-07

	.quad	-4689636306257622577    # double -1.290557e-05

.LCPI2_2:                               # constant pool <2 x double>

	.quad	4565700531973276514     # double 1.747237e-03

	.quad	-4646787883676373161    # double -9.420470e-03

.LCPI2_3:                               # constant pool <2 x double>

	.quad	4582499295942488407     # double 2.373821e-02

	.quad	4588989690655579855     # double 6.126259e-02

.LCPI2_4:                               # constant pool <2 x double>

	.quad	4607128750031811026     # double 9.940416e-01

	.quad	-4634493755982099125    # double -6.048953e-02

.LCPI2_5:                               # constant pool <2 x double>

	.quad	-4641653258114437092    # double -2.103025e-02

	.quad	4578862420522603841     # double 1.337263e-02

.LCPI2_6:                               # constant pool <2 x double>

	.quad	-4653290559565923332    # double -3.441281e-03

	.quad	4557699556108328520     # double 4.944323e-04

.LCPI2_7:                               # constant pool <2 x double>

	.quad	-4680594046271356616    # double -5.185987e-05

	.quad	4523550187392857396     # double 2.727345e-06

.LCPI2_8:                               # constant pool <2 x double>

	.quad	-4689636306257622577    # double -1.290557e-05

	.quad	4546094365667641806     # double 8.762984e-05

.LCPI2_9:                               # constant pool <2 x double>

	.quad	4523550187392857396     # double 2.727345e-06

	.quad	4516068371457184450     # double 8.433425e-07

.LCPI2_10:                              # constant pool <2 x double>

	.quad	-4669173237098585703    # double -3.015804e-04

	.quad	4565700531973276514     # double 1.747237e-03

.LCPI2_11:                              # constant pool <2 x double>

	.quad	-4646787883676373161    # double -9.420470e-03

	.quad	4582499295942488407     # double 2.373821e-02

.LCPI2_12:                              # constant pool <2 x double>

	.quad	4588989690655579855     # double 6.126259e-02

	.quad	4607128750031811026     # double 9.940416e-01

.LCPI2_13:                              # constant pool <2 x double>

	.quad	-4634493755982099125    # double -6.048953e-02

	.quad	-4641653258114437092    # double -2.103025e-02

.LCPI2_14:                              # constant pool <2 x double>

	.quad	4578862420522603841     # double 1.337263e-02

	.quad	-4653290559565923332    # double -3.441281e-03

.LCPI2_15:                              # constant pool <2 x double>

	.quad	4557699556108328520     # double 4.944323e-04

	.quad	-4680594046271356616    # double -5.185987e-05

	.text

	.globl	magic_filter5

	.align	16, 0x90

	.type	magic_filter5,@function

magic_filter5:                          # @magic_filter5

# BB#0:                                 # %FirstBB

	push	RBP

	push	R15

	push	R14

	push	R13

	push	R12

	push	RBX

	sub	RSP, 344

	mov	RAX, QWORD PTR [RSP + 400]

	shr	RAX

	mov	EAX, EAX

	lea	RCX, QWORD PTR [RAX + 2*RAX]

	mov	RDX, RAX

	shl	RDX, 5

	mov	RSI, QWORD PTR [RSP + 416]

	lea	RDI, QWORD PTR [RDX + RSI + 128]

	mov	QWORD PTR [RSP + 8], RDI # 8-byte Spill

	lea	RDI, QWORD PTR [RSI + RDX + 16]

	mov	QWORD PTR [RSP], RDI    # 8-byte Spill

	lea	RDI, QWORD PTR [RDX + RSI + 32]

	mov	QWORD PTR [RSP - 8], RDI # 8-byte Spill

	lea	RDI, QWORD PTR [RDX + RSI + 48]

	mov	QWORD PTR [RSP - 16], RDI # 8-byte Spill

	lea	RDI, QWORD PTR [RDX + RSI + 64]

	mov	QWORD PTR [RSP - 24], RDI # 8-byte Spill

	lea	RDI, QWORD PTR [RDX + RSI + 80]

	mov	QWORD PTR [RSP - 32], RDI # 8-byte Spill

	lea	RDI, QWORD PTR [RDX + RSI + 96]

	mov	QWORD PTR [RSP - 40], RDI # 8-byte Spill

	lea	RDI, QWORD PTR [RDX + RSI + 112]

	mov	QWORD PTR [RSP - 48], RDI # 8-byte Spill

	add	RDX, RSI

	shl	RAX, 4

	lea	RDI, QWORD PTR [RSI + RAX + 128]

	mov	QWORD PTR [RSP - 56], RDI # 8-byte Spill

	lea	RDI, QWORD PTR [RSI + RAX + 16]

	mov	QWORD PTR [RSP - 64], RDI # 8-byte Spill

	lea	RDI, QWORD PTR [RSI + RAX + 32]

	mov	QWORD PTR [RSP - 72], RDI # 8-byte Spill

	lea	RDI, QWORD PTR [RSI + RAX + 48]

	mov	QWORD PTR [RSP - 80], RDI # 8-byte Spill

	lea	RDI, QWORD PTR [RSI + RAX + 64]

	mov	QWORD PTR [RSP - 88], RDI # 8-byte Spill

	lea	RDI, QWORD PTR [RSI + RAX + 80]

	mov	QWORD PTR [RSP - 96], RDI # 8-byte Spill

	lea	RDI, QWORD PTR [RSI + RAX + 96]

	mov	QWORD PTR [RSP - 104], RDI # 8-byte Spill

	lea	RDI, QWORD PTR [RSI + RAX + 112]

	mov	QWORD PTR [RSP - 112], RDI # 8-byte Spill

	shl	RCX, 4

	lea	RDI, QWORD PTR [RSI + RCX + 128]

	mov	QWORD PTR [RSP - 120], RDI # 8-byte Spill

	lea	RDI, QWORD PTR [RSI + RCX + 16]

	mov	QWORD PTR [RSP - 128], RDI # 8-byte Spill

	lea	RDI, QWORD PTR [RSI + RCX + 32]

	lea	R8, QWORD PTR [RSI + RCX + 48]

	lea	R9, QWORD PTR [RSI + RCX + 64]

	lea	R10, QWORD PTR [RSI + RCX + 80]

	lea	R11, QWORD PTR [RSI + RCX + 96]

	lea	RBX, QWORD PTR [RSI + RCX + 112]

	mov	R14, QWORD PTR [RSP + 408]

	movabs	R15, 8589934590

	and	R15, R14

	mov	R12, QWORD PTR [RSP + 424]

	lea	R15, QWORD PTR [R12 + 8*R15 + 64]

	shr	R14

	inc	R14D

	shl	R14, 4

	lea	R14, QWORD PTR [R14 + R12 + 64]

	add	RCX, RSI

	add	RAX, RSI

	mov	R13, -1

	.align	16, 0x90

.LBB2_1:                                # %SyncBB

                                        # =>This Inner Loop Header: Depth=1

	mov	RBP, QWORD PTR [RSP - 64] # 8-byte Reload

	movapd	XMM0, XMMWORD PTR [RBP]

	movapd	XMMWORD PTR [RSP + 304], XMM0 # 16-byte Spill

	movapd	XMM1, XMM0

	mulpd	XMM1, XMMWORD PTR [RIP + .LCPI2_0]

	movapd	XMM2, XMMWORD PTR [RAX]

	movapd	XMM3, XMM2

	mulpd	XMM3, XMMWORD PTR [RIP + .LCPI2_1]

	addpd	XMM3, XMM1

	mov	RBP, QWORD PTR [RSP - 72] # 8-byte Reload

	movapd	XMM0, XMMWORD PTR [RBP]

	movapd	XMMWORD PTR [RSP + 64], XMM0 # 16-byte Spill

	movapd	XMM1, XMM0

	mulpd	XMM1, XMMWORD PTR [RIP + .LCPI2_2]

	addpd	XMM1, XMM3

	mov	RBP, QWORD PTR [RSP - 80] # 8-byte Reload

	movapd	XMM0, XMMWORD PTR [RBP]

	movapd	XMMWORD PTR [RSP + 288], XMM0 # 16-byte Spill

	movapd	XMM3, XMM0

	mulpd	XMM3, XMMWORD PTR [RIP + .LCPI2_3]

	addpd	XMM3, XMM1

	mov	RBP, QWORD PTR [RSP - 88] # 8-byte Reload

	movapd	XMM0, XMMWORD PTR [RBP]

	movapd	XMMWORD PTR [RSP + 256], XMM0 # 16-byte Spill

	movapd	XMM1, XMM0

	mulpd	XMM1, XMMWORD PTR [RIP + .LCPI2_4]

	addpd	XMM1, XMM3

	mov	RBP, QWORD PTR [RSP - 96] # 8-byte Reload

	movapd	XMM0, XMMWORD PTR [RBP]

	movapd	XMMWORD PTR [RSP + 144], XMM0 # 16-byte Spill

	movapd	XMM3, XMM0

	mulpd	XMM3, XMMWORD PTR [RIP + .LCPI2_5]

	addpd	XMM3, XMM1

	mov	RBP, QWORD PTR [RSP - 104] # 8-byte Reload

	movapd	XMM0, XMMWORD PTR [RBP]

	movapd	XMMWORD PTR [RSP + 128], XMM0 # 16-byte Spill

	movapd	XMM1, XMM0

	mulpd	XMM1, XMMWORD PTR [RIP + .LCPI2_6]

	addpd	XMM1, XMM3

	mov	RBP, QWORD PTR [RSP - 112] # 8-byte Reload

	movapd	XMM3, XMMWORD PTR [RBP]

	movapd	XMM4, XMM3

	mulpd	XMM4, XMMWORD PTR [RIP + .LCPI2_7]

	addpd	XMM4, XMM1

	movapd	XMM1, XMM4

	unpckhpd	XMM1, XMM1      # xmm1 = xmm1[1,1]

	addsd	XMM1, XMM4

	movapd	XMM4, XMMWORD PTR [RSI]

	movapd	XMM5, XMMWORD PTR [RSI + 16]

	movapd	XMMWORD PTR [RSP + 320], XMM5 # 16-byte Spill

	movapd	XMM0, XMMWORD PTR [RSI + 32]

	movapd	XMMWORD PTR [RSP + 112], XMM0 # 16-byte Spill

	movapd	XMM5, XMMWORD PTR [RSI + 48]

	movapd	XMMWORD PTR [RSP + 272], XMM5 # 16-byte Spill

	movapd	XMM6, XMMWORD PTR [RSP + 320] # 16-byte Reload

	mulpd	XMM6, XMMWORD PTR [RIP + .LCPI2_0]

	movapd	XMM7, XMM4

	mulpd	XMM7, XMMWORD PTR [RIP + .LCPI2_1]

	addpd	XMM7, XMM6

	movapd	XMM6, XMM0

	mulpd	XMM6, XMMWORD PTR [RIP + .LCPI2_2]

	addpd	XMM6, XMM7

	movapd	XMM7, XMM5

	mulpd	XMM7, XMMWORD PTR [RIP + .LCPI2_3]

	addpd	XMM7, XMM6

	movapd	XMM0, XMMWORD PTR [RSI + 64]

	movapd	XMMWORD PTR [RSP + 96], XMM0 # 16-byte Spill

	movapd	XMM5, XMM0

	mulpd	XMM5, XMMWORD PTR [RIP + .LCPI2_4]

	addpd	XMM5, XMM7

	movapd	XMM0, XMMWORD PTR [RSI + 80]

	movapd	XMMWORD PTR [RSP + 80], XMM0 # 16-byte Spill

	movapd	XMM6, XMM0

	mulpd	XMM6, XMMWORD PTR [RIP + .LCPI2_5]

	addpd	XMM6, XMM5

	movapd	XMM0, XMMWORD PTR [RSI + 96]

	movapd	XMMWORD PTR [RSP + 16], XMM0 # 16-byte Spill

	movapd	XMM5, XMM0

	mulpd	XMM5, XMMWORD PTR [RIP + .LCPI2_6]

	addpd	XMM5, XMM6

	movapd	XMM6, XMMWORD PTR [RSI + 112]

	movapd	XMM7, XMM6

	mulpd	XMM7, XMMWORD PTR [RIP + .LCPI2_7]

	addpd	XMM7, XMM5

	movapd	XMM5, XMM7

	unpckhpd	XMM5, XMM5      # xmm5 = xmm5[1,1]

	addsd	XMM5, XMM7

	unpcklpd	XMM5, XMM1      # xmm5 = xmm5[0],xmm1[0]

	movapd	XMM1, XMMWORD PTR [RDX]

	movapd	XMM7, XMM1

	mulpd	XMM7, XMMWORD PTR [RIP + .LCPI2_1]

	mov	RBP, QWORD PTR [RSP + 8] # 8-byte Reload

	movlpd	XMM1, QWORD PTR [RBP]

	movapd	XMM8, XMMWORD PTR [RCX]

	movapd	XMM9, XMM8

	mulpd	XMM9, XMMWORD PTR [RIP + .LCPI2_1]

	mov	RBP, QWORD PTR [RSP - 120] # 8-byte Reload

	movlpd	XMM8, QWORD PTR [RBP]

	movapd	XMM10, XMMWORD PTR [RSP + 320] # 16-byte Reload

	mulpd	XMM10, XMMWORD PTR [RIP + .LCPI2_8]

	movapd	XMMWORD PTR [RSP + 320], XMM10 # 16-byte Spill

	movlpd	XMM4, QWORD PTR [RSI + 128]

	mov	RBP, QWORD PTR [RSP - 56] # 8-byte Reload

	movlpd	XMM2, QWORD PTR [RBP]

	mov	RBP, QWORD PTR [RSP - 48] # 8-byte Reload

	movapd	XMM10, XMMWORD PTR [RBP]

	movapd	XMMWORD PTR [RSP + 48], XMM10 # 16-byte Spill

	mov	RBP, QWORD PTR [RSP - 40] # 8-byte Reload

	movapd	XMM11, XMMWORD PTR [RBP]

	movapd	XMMWORD PTR [RSP + 240], XMM11 # 16-byte Spill

	mov	RBP, QWORD PTR [RSP - 32] # 8-byte Reload

	movapd	XMM12, XMMWORD PTR [RBP]

	movapd	XMMWORD PTR [RSP + 224], XMM12 # 16-byte Spill

	mov	RBP, QWORD PTR [RSP - 24] # 8-byte Reload

	movapd	XMM13, XMMWORD PTR [RBP]

	movapd	XMMWORD PTR [RSP + 208], XMM13 # 16-byte Spill

	mov	RBP, QWORD PTR [RSP - 16] # 8-byte Reload

	movapd	XMM14, XMMWORD PTR [RBP]

	movapd	XMMWORD PTR [RSP + 192], XMM14 # 16-byte Spill

	mov	RBP, QWORD PTR [RSP - 8] # 8-byte Reload

	movapd	XMM15, XMMWORD PTR [RBP]

	movapd	XMMWORD PTR [RSP + 160], XMM15 # 16-byte Spill

	mov	RBP, QWORD PTR [RSP]    # 8-byte Reload

	movapd	XMM0, XMMWORD PTR [RBP]

	movapd	XMMWORD PTR [RSP + 176], XMM0 # 16-byte Spill

	movapd	XMM0, XMMWORD PTR [RBX]

	movapd	XMM10, XMMWORD PTR [R11]

	movapd	XMM11, XMMWORD PTR [R10]

	movapd	XMM12, XMMWORD PTR [R9]

	movapd	XMM13, XMMWORD PTR [R8]

	movapd	XMM14, XMMWORD PTR [RDI]

	mov	RBP, QWORD PTR [RSP - 128] # 8-byte Reload

	movapd	XMM15, XMMWORD PTR [RBP]

	movapd	XMMWORD PTR [RSP + 32], XMM15 # 16-byte Spill

	movapd	XMMWORD PTR [R12 + 64], XMM5

	movapd	XMM5, XMM15

	mulpd	XMM5, XMMWORD PTR [RIP + .LCPI2_0]

	addpd	XMM5, XMM9

	movapd	XMM9, XMM14

	mulpd	XMM9, XMMWORD PTR [RIP + .LCPI2_2]

	addpd	XMM9, XMM5

	movapd	XMM5, XMM13

	mulpd	XMM5, XMMWORD PTR [RIP + .LCPI2_3]

	addpd	XMM5, XMM9

	movapd	XMM9, XMM12

	mulpd	XMM9, XMMWORD PTR [RIP + .LCPI2_4]

	addpd	XMM9, XMM5

	movapd	XMM5, XMM11

	mulpd	XMM5, XMMWORD PTR [RIP + .LCPI2_5]

	addpd	XMM5, XMM9

	movapd	XMM9, XMM10

	mulpd	XMM9, XMMWORD PTR [RIP + .LCPI2_6]

	addpd	XMM9, XMM5

	movapd	XMM5, XMM0

	mulpd	XMM5, XMMWORD PTR [RIP + .LCPI2_7]

	addpd	XMM5, XMM9

	movapd	XMM9, XMM5

	unpckhpd	XMM9, XMM9      # xmm9 = xmm9[1,1]

	addsd	XMM9, XMM5

	movapd	XMM5, XMMWORD PTR [RSP + 176] # 16-byte Reload

	mulpd	XMM5, XMMWORD PTR [RIP + .LCPI2_0]

	addpd	XMM5, XMM7

	movapd	XMM7, XMMWORD PTR [RSP + 160] # 16-byte Reload

	mulpd	XMM7, XMMWORD PTR [RIP + .LCPI2_2]

	addpd	XMM7, XMM5

	movapd	XMM5, XMMWORD PTR [RSP + 192] # 16-byte Reload

	mulpd	XMM5, XMMWORD PTR [RIP + .LCPI2_3]

	addpd	XMM5, XMM7

	movapd	XMM7, XMMWORD PTR [RSP + 208] # 16-byte Reload

	mulpd	XMM7, XMMWORD PTR [RIP + .LCPI2_4]

	addpd	XMM7, XMM5

	movapd	XMM5, XMMWORD PTR [RSP + 224] # 16-byte Reload

	mulpd	XMM5, XMMWORD PTR [RIP + .LCPI2_5]

	addpd	XMM5, XMM7

	movapd	XMM7, XMMWORD PTR [RSP + 240] # 16-byte Reload

	mulpd	XMM7, XMMWORD PTR [RIP + .LCPI2_6]

	addpd	XMM7, XMM5

	movapd	XMM5, XMMWORD PTR [RSP + 48] # 16-byte Reload

	mulpd	XMM5, XMMWORD PTR [RIP + .LCPI2_7]

	addpd	XMM5, XMM7

	movapd	XMM7, XMM5

	unpckhpd	XMM7, XMM7      # xmm7 = xmm7[1,1]

	addsd	XMM7, XMM5

	unpcklpd	XMM7, XMM9      # xmm7 = xmm7[0],xmm9[0]

	movapd	XMMWORD PTR [R12 + 80], XMM7

	movapd	XMM5, XMMWORD PTR [RSP + 304] # 16-byte Reload

	mulpd	XMM5, XMMWORD PTR [RIP + .LCPI2_8]

	movapd	XMMWORD PTR [RSP + 304], XMM5 # 16-byte Spill

	movapd	XMM5, XMMWORD PTR [RIP + .LCPI2_9]

	mulpd	XMM2, XMM5

	addpd	XMM2, XMMWORD PTR [RSP + 304] # 16-byte Folded Reload

	movapd	XMM7, XMMWORD PTR [RIP + .LCPI2_10]

	movapd	XMM9, XMMWORD PTR [RSP + 64] # 16-byte Reload

	mulpd	XMM9, XMM7

	addpd	XMM9, XMM2

	movapd	XMMWORD PTR [RSP + 64], XMM9 # 16-byte Spill

	movapd	XMM2, XMMWORD PTR [RSP + 288] # 16-byte Reload

	mulpd	XMM2, XMMWORD PTR [RIP + .LCPI2_11]

	addpd	XMM2, XMM9

	movapd	XMMWORD PTR [RSP + 288], XMM2 # 16-byte Spill

	movapd	XMM2, XMMWORD PTR [RSP + 256] # 16-byte Reload

	mulpd	XMM2, XMMWORD PTR [RIP + .LCPI2_12]

	addpd	XMM2, XMMWORD PTR [RSP + 288] # 16-byte Folded Reload

	movapd	XMMWORD PTR [RSP + 256], XMM2 # 16-byte Spill

	movapd	XMM2, XMMWORD PTR [RSP + 144] # 16-byte Reload

	mulpd	XMM2, XMMWORD PTR [RIP + .LCPI2_13]

	addpd	XMM2, XMMWORD PTR [RSP + 256] # 16-byte Folded Reload

	movapd	XMMWORD PTR [RSP + 144], XMM2 # 16-byte Spill

	movapd	XMM2, XMMWORD PTR [RIP + .LCPI2_14]

	movapd	XMM9, XMMWORD PTR [RSP + 128] # 16-byte Reload

	mulpd	XMM9, XMM2

	addpd	XMM9, XMMWORD PTR [RSP + 144] # 16-byte Folded Reload

	movapd	XMMWORD PTR [RSP + 128], XMM9 # 16-byte Spill

	movapd	XMM9, XMMWORD PTR [RIP + .LCPI2_15]

	mulpd	XMM3, XMM9

	addpd	XMM3, XMMWORD PTR [RSP + 128] # 16-byte Folded Reload

	movapd	XMM15, XMM3

	unpckhpd	XMM15, XMM15    # xmm15 = xmm15[1,1]

	addsd	XMM15, XMM3

	mulpd	XMM4, XMM5

	addpd	XMM4, XMMWORD PTR [RSP + 320] # 16-byte Folded Reload

	movapd	XMM3, XMMWORD PTR [RSP + 112] # 16-byte Reload

	mulpd	XMM3, XMM7

	addpd	XMM3, XMM4

	movapd	XMMWORD PTR [RSP + 112], XMM3 # 16-byte Spill

	movapd	XMM3, XMMWORD PTR [RSP + 272] # 16-byte Reload

	mulpd	XMM3, XMMWORD PTR [RIP + .LCPI2_11]

	addpd	XMM3, XMMWORD PTR [RSP + 112] # 16-byte Folded Reload

	movapd	XMMWORD PTR [RSP + 272], XMM3 # 16-byte Spill

	movapd	XMM3, XMMWORD PTR [RSP + 96] # 16-byte Reload

	mulpd	XMM3, XMMWORD PTR [RIP + .LCPI2_12]

	addpd	XMM3, XMMWORD PTR [RSP + 272] # 16-byte Folded Reload

	movapd	XMMWORD PTR [RSP + 96], XMM3 # 16-byte Spill

	movapd	XMM3, XMMWORD PTR [RSP + 80] # 16-byte Reload

	mulpd	XMM3, XMMWORD PTR [RIP + .LCPI2_13]

	addpd	XMM3, XMMWORD PTR [RSP + 96] # 16-byte Folded Reload

	movapd	XMMWORD PTR [RSP + 80], XMM3 # 16-byte Spill

	movapd	XMM3, XMMWORD PTR [RSP + 16] # 16-byte Reload

	mulpd	XMM3, XMM2

	addpd	XMM3, XMMWORD PTR [RSP + 80] # 16-byte Folded Reload

	movapd	XMMWORD PTR [RSP + 16], XMM3 # 16-byte Spill

	mulpd	XMM6, XMM9

	addpd	XMM6, XMM3

	movapd	XMM3, XMM6

	unpckhpd	XMM3, XMM3      # xmm3 = xmm3[1,1]

	addsd	XMM3, XMM6

	unpcklpd	XMM3, XMM15     # xmm3 = xmm3[0],xmm15[0]

	movapd	XMMWORD PTR [R15], XMM3

	movapd	XMM15, XMMWORD PTR [RSP + 32] # 16-byte Reload

	mulpd	XMM15, XMMWORD PTR [RIP + .LCPI2_8]

	movapd	XMMWORD PTR [RSP + 32], XMM15 # 16-byte Spill

	mulpd	XMM8, XMM5

	addpd	XMM8, XMM15

	mulpd	XMM14, XMM7

	addpd	XMM14, XMM8

	mulpd	XMM13, XMMWORD PTR [RIP + .LCPI2_11]

	addpd	XMM13, XMM14

	mulpd	XMM12, XMMWORD PTR [RIP + .LCPI2_12]

	addpd	XMM12, XMM13

	mulpd	XMM11, XMMWORD PTR [RIP + .LCPI2_13]

	addpd	XMM11, XMM12

	mulpd	XMM10, XMM2

	addpd	XMM10, XMM11

	mulpd	XMM0, XMM9

	addpd	XMM0, XMM10

	movapd	XMM3, XMM0

	unpckhpd	XMM3, XMM3      # xmm3 = xmm3[1,1]

	addsd	XMM3, XMM0

	movapd	XMM0, XMMWORD PTR [RSP + 176] # 16-byte Reload

	mulpd	XMM0, XMMWORD PTR [RIP + .LCPI2_8]

	movapd	XMMWORD PTR [RSP + 176], XMM0 # 16-byte Spill

	mulpd	XMM1, XMM5

	addpd	XMM1, XMM0

	movapd	XMM15, XMMWORD PTR [RSP + 160] # 16-byte Reload

	mulpd	XMM15, XMM7

	addpd	XMM15, XMM1

	movapd	XMMWORD PTR [RSP + 160], XMM15 # 16-byte Spill

	movapd	XMM14, XMMWORD PTR [RSP + 192] # 16-byte Reload

	mulpd	XMM14, XMMWORD PTR [RIP + .LCPI2_11]

	addpd	XMM14, XMM15

	movapd	XMMWORD PTR [RSP + 192], XMM14 # 16-byte Spill

	movapd	XMM13, XMMWORD PTR [RSP + 208] # 16-byte Reload

	mulpd	XMM13, XMMWORD PTR [RIP + .LCPI2_12]

	addpd	XMM13, XMM14

	movapd	XMMWORD PTR [RSP + 208], XMM13 # 16-byte Spill

	movapd	XMM12, XMMWORD PTR [RSP + 224] # 16-byte Reload

	mulpd	XMM12, XMMWORD PTR [RIP + .LCPI2_13]

	addpd	XMM12, XMM13

	movapd	XMMWORD PTR [RSP + 224], XMM12 # 16-byte Spill

	movapd	XMM11, XMMWORD PTR [RSP + 240] # 16-byte Reload

	mulpd	XMM11, XMM2

	addpd	XMM11, XMM12

	movapd	XMMWORD PTR [RSP + 240], XMM11 # 16-byte Spill

	movapd	XMM10, XMMWORD PTR [RSP + 48] # 16-byte Reload

	mulpd	XMM10, XMM9

	addpd	XMM10, XMM11

	movapd	XMM0, XMM10

	unpckhpd	XMM0, XMM0      # xmm0 = xmm0[1,1]

	addsd	XMM0, XMM10

	unpcklpd	XMM0, XMM3      # xmm0 = xmm0[0],xmm3[0]

	movapd	XMMWORD PTR [R14], XMM0

	inc	R13

	cmp	R13, QWORD PTR [RSP + 480]

	jb	.LBB2_1

# BB#2:                                 # %SyncBB29

	add	RSP, 344

	pop	RBX

	pop	R12

	pop	R13

	pop	R14

	pop	R15

	pop	RBP

	ret

.Ltmp2:

	.size	magic_filter5, .Ltmp2-magic_filter5

As can be seen, the OpenCL version is spilling a lot of registers. This effect halves the performance.
Am I doing something wrong? I would have expected code written in intrinsics to port quite straightforwardly to OpenCL vectorized language. Even though I was curious to see if the compiler would optimize the transposition using haddpd, I did not expect such differences.

Sorry for the long post with lots of code but I could not reduce further.

3 Beiträge / 0 neu
Letzter Beitrag
Nähere Informationen zur Compiler-Optimierung finden Sie in unserem Optimierungshinweis.

I am still struggling with this problem. Would there be a way in the compiler to bypass most of the LLVM optimizations and go straight to the code generation phase? This way the compiler would just be used to allocate registers, the way it happens when using intrinsics and a standard compiler.

One specific optimization you might want to disable first when dealing with explicitly vectorized code is the OpenCL vectorizer itself.

Consider prefixing your kernel with vec_type_hint:

__kernel __attribute__((vec_type_hint(double2))) void foo(...)

This is unlikely to be related to the register allocation, though. If you are seeing too many spills/fills, it might indicate that you have (for example) too many constants in use. A native compiler (as opposed to OpenCL, which is JIT-compiled) might handle this situation somewhat better.

Kommentar hinterlassen

Bitte anmelden, um einen Kommentar hinzuzufügen. Sie sind noch nicht Mitglied? Jetzt teilnehmen