I have problems trying to translate code written with intrinsics to OpenCL vector code. I am working with double precision. The code is doing vectorized filtering of data using a filter of length 16.
The original code is (sorry but I could not reduce it further):
const double filter[] __attribute__ ((aligned (16))) = {
8.4334247333529341094733325815816e-7,
-0.1290557201342060969516786758559028e-4,
0.8762984476210559564689161894116397e-4,
-0.30158038132690463167163703826169879e-3,
0.174723713672993903449447812749852942e-2,
-0.942047030201080385922711540948195075e-2,
0.2373821463724942397566389712597274535e-1,
0.612625895831207982195380597e-1,
0.9940415697834003993178616713,
-0.604895289196983516002834636e-1,
-0.2103025160930381434955489412839065067e-1,
0.1337263414854794752733423467013220997e-1,
-0.344128144493493857280881509686821861e-2,
0.49443227688689919192282259476750972e-3,
-0.5185986881173432922848639136911487e-4,
2.72734492911979659657715313017228e-6};
const double filter_u[] __attribute__ ((aligned (16))) = {
2.72734492911979659657715313017228e-6,
8.4334247333529341094733325815816e-7,
-0.1290557201342060969516786758559028e-4,
0.8762984476210559564689161894116397e-4,
-0.30158038132690463167163703826169879e-3,
0.174723713672993903449447812749852942e-2,
-0.942047030201080385922711540948195075e-2,
0.2373821463724942397566389712597274535e-1,
0.612625895831207982195380597e-1,
0.9940415697834003993178616713,
-0.604895289196983516002834636e-1,
-0.2103025160930381434955489412839065067e-1,
0.1337263414854794752733423467013220997e-1,
-0.344128144493493857280881509686821861e-2,
0.49443227688689919192282259476750972e-3,
-0.5185986881173432922848639136911487e-4,
2.72734492911979659657715313017228e-6};
#define conv_4x2_block_fused(offset_filter,offset_source,d00,d10,d20,d30) \
FA = _mm_load_pd(filter+offset_filter);\
d00 = _mm_load_pd(source0+offset_source);\
S00 = _mm_add_pd(S00,_mm_mul_pd(d00,FA));\
FU = _mm_load_pd(filter_u+offset_filter);\
d10 = _mm_load_pd(source1+offset_source);\
S01 = _mm_add_pd(S01,_mm_mul_pd(d00,FU));\
S11 = _mm_add_pd(S11,_mm_mul_pd(d10,FU));\
d20 = _mm_load_pd(source2+offset_source);\
S10 = _mm_add_pd(S10,_mm_mul_pd(d10,FA));\
d30 = _mm_load_pd(source3+offset_source);\
S20 = _mm_add_pd(S20,_mm_mul_pd(d20,FA));\
S30 = _mm_add_pd(S30,_mm_mul_pd(d30,FA));\
S31 = _mm_add_pd(S31,_mm_mul_pd(d30,FU));\
S21 = _mm_add_pd(S21,_mm_mul_pd(d20,FU));
void conv_4x2_fused(size_t ndat, double const * source0, double const * source1,
double const * source2, double const * source3,
double * dest){
__m128d S00,S01,S10,S11,S20,S21,S30,S31;
__m128d FA,FU;
__m128d D00,D10,D20,D30;
FA = _mm_load_pd(filter);
D00 = _mm_load_pd(source0);
S00 = _mm_mul_pd(D00,FA);
D10 = _mm_load_pd(source1);
S10 = _mm_mul_pd(D10,FA);
D20 = _mm_load_pd(source2);
S20 = _mm_mul_pd(D20,FA);
D30 = _mm_load_pd(source3);
S30 = _mm_mul_pd(D30,FA);
FU = _mm_load_pd(filter_u);
S01 = _mm_loadl_pd(D00,source0+16);
S01 = _mm_mul_pd(S01,FU);
S11 = _mm_loadl_pd(D10,source1+16);
S11 = _mm_mul_pd(S11,FU);
S21 = _mm_loadl_pd(D20,source2+16);
S21 = _mm_mul_pd(S21,FU);
S31 = _mm_loadl_pd(D30,source3+16);
S31 = _mm_mul_pd(S31,FU);
conv_4x2_block_fused(2,2,D00,D10,D20,D30);
conv_4x2_block_fused(4,4,D00,D10,D20,D30);
conv_4x2_block_fused(6,6,D00,D10,D20,D30);
conv_4x2_block_fused(8,8,D00,D10,D20,D30);
conv_4x2_block_fused(10,10,D00,D10,D20,D30);
conv_4x2_block_fused(12,12,D00,D10,D20,D30);
conv_4x2_block_fused(14,14,D00,D10,D20,D30);
_mm_store_pd(dest,_mm_hadd_pd(S00,S10));
_mm_store_pd(dest+2,_mm_hadd_pd(S20,S30));
_mm_store_pd(dest+ndat,_mm_hadd_pd(S01,S11));
_mm_store_pd(dest+2+ndat,_mm_hadd_pd(S21,S31));
}
Here is the generated assembly (gcc-4.6, but icc generates similar code):
0000000000408970 : 408970: 66 0f 28 0e movapd (%rsi),%xmm1 408974: 66 0f 28 1a movapd (%rdx),%xmm3 408978: 66 0f 28 01 movapd (%rcx),%xmm0 40897c: 66 0f 28 e9 movapd %xmm1,%xmm5 408980: 66 0f 12 8e 80 00 00 movlpd 0x80(%rsi),%xmm1 408987: 00 408988: 66 0f 28 35 80 ee 00 movapd 0xee80(%rip),%xmm6 # 417810 40898f: 00 408990: 66 0f 28 fb movapd %xmm3,%xmm7 408994: 66 0f 12 9a 80 00 00 movlpd 0x80(%rdx),%xmm3 40899b: 00 40899c: 66 41 0f 28 10 movapd (%r8),%xmm2 4089a1: 66 0f 28 e0 movapd %xmm0,%xmm4 4089a5: 66 0f 12 81 80 00 00 movlpd 0x80(%rcx),%xmm0 4089ac: 00 4089ad: 66 44 0f 28 05 ca ed movapd 0xedca(%rip),%xmm8 # 417780 4089b4: 00 00 4089b6: 66 0f 59 ee mulpd %xmm6,%xmm5 4089ba: 66 44 0f 28 56 10 movapd 0x10(%rsi),%xmm10 4089c0: 66 0f 59 fe mulpd %xmm6,%xmm7 4089c4: 66 44 0f 28 0d 53 ee movapd 0xee53(%rip),%xmm9 # 417820 4089cb: 00 00 4089cd: 66 0f 59 e6 mulpd %xmm6,%xmm4 4089d1: 66 0f 59 f2 mulpd %xmm2,%xmm6 4089d5: 66 44 0f 28 5a 10 movapd 0x10(%rdx),%xmm11 4089db: 66 41 0f 12 90 80 00 movlpd 0x80(%r8),%xmm2 4089e2: 00 00 4089e4: 66 41 0f 59 c8 mulpd %xmm8,%xmm1 4089e9: 66 41 0f 59 d8 mulpd %xmm8,%xmm3 4089ee: 66 41 0f 59 c0 mulpd %xmm8,%xmm0 4089f3: 66 41 0f 59 d0 mulpd %xmm8,%xmm2 4089f8: 66 45 0f 28 c2 movapd %xmm10,%xmm8 4089fd: 66 45 0f 59 c1 mulpd %xmm9,%xmm8 408a02: 66 41 0f 58 e8 addpd %xmm8,%xmm5 408a07: 66 44 0f 28 05 80 ed movapd 0xed80(%rip),%xmm8 # 417790 408a0e: 00 00 408a10: 66 45 0f 59 d0 mulpd %xmm8,%xmm10 408a15: 66 41 0f 58 ca addpd %xmm10,%xmm1 408a1a: 66 45 0f 28 d3 movapd %xmm11,%xmm10 408a1f: 66 45 0f 59 d0 mulpd %xmm8,%xmm10 408a24: 66 45 0f 59 d9 mulpd %xmm9,%xmm11 408a29: 66 41 0f 58 da addpd %xmm10,%xmm3 408a2e: 66 44 0f 28 51 10 movapd 0x10(%rcx),%xmm10 408a34: 66 41 0f 58 fb addpd %xmm11,%xmm7 408a39: 66 45 0f 28 58 10 movapd 0x10(%r8),%xmm11 408a3f: 66 45 0f 28 e2 movapd %xmm10,%xmm12 408a44: 66 45 0f 59 e1 mulpd %xmm9,%xmm12 408a49: 66 45 0f 59 cb mulpd %xmm11,%xmm9 408a4e: 66 45 0f 59 d8 mulpd %xmm8,%xmm11 408a53: 66 45 0f 59 c2 mulpd %xmm10,%xmm8 408a58: 66 44 0f 28 56 20 movapd 0x20(%rsi),%xmm10 408a5e: 66 41 0f 58 e4 addpd %xmm12,%xmm4 408a63: 66 41 0f 58 f1 addpd %xmm9,%xmm6 408a68: 66 44 0f 28 0d bf ed movapd 0xedbf(%rip),%xmm9 # 417830 408a6f: 00 00 408a71: 66 41 0f 58 d3 addpd %xmm11,%xmm2 408a76: 66 44 0f 28 5a 20 movapd 0x20(%rdx),%xmm11 408a7c: 66 41 0f 58 c0 addpd %xmm8,%xmm0 408a81: 66 45 0f 28 c2 movapd %xmm10,%xmm8 408a86: 66 45 0f 59 c1 mulpd %xmm9,%xmm8 408a8b: 66 41 0f 58 e8 addpd %xmm8,%xmm5 408a90: 66 44 0f 28 05 07 ed movapd 0xed07(%rip),%xmm8 # 4177a0 408a97: 00 00 408a99: 66 45 0f 59 d0 mulpd %xmm8,%xmm10 408a9e: 66 41 0f 58 ca addpd %xmm10,%xmm1 408aa3: 66 45 0f 28 d3 movapd %xmm11,%xmm10 408aa8: 66 45 0f 59 d0 mulpd %xmm8,%xmm10 408aad: 66 45 0f 59 d9 mulpd %xmm9,%xmm11 408ab2: 66 41 0f 58 da addpd %xmm10,%xmm3 408ab7: 66 44 0f 28 51 20 movapd 0x20(%rcx),%xmm10 408abd: 66 41 0f 58 fb addpd %xmm11,%xmm7 408ac2: 66 45 0f 28 58 20 movapd 0x20(%r8),%xmm11 408ac8: 66 45 0f 28 e2 movapd %xmm10,%xmm12 408acd: 66 45 0f 59 e1 mulpd %xmm9,%xmm12 408ad2: 66 45 0f 59 cb mulpd %xmm11,%xmm9 408ad7: 66 45 0f 59 d8 mulpd %xmm8,%xmm11 408adc: 66 45 0f 59 c2 mulpd %xmm10,%xmm8 408ae1: 66 44 0f 28 56 30 movapd 0x30(%rsi),%xmm10 408ae7: 66 41 0f 58 e4 addpd %xmm12,%xmm4 408aec: 66 41 0f 58 f1 addpd %xmm9,%xmm6 408af1: 66 44 0f 28 0d 46 ed movapd 0xed46(%rip),%xmm9 # 417840 408af8: 00 00 408afa: 66 41 0f 58 d3 addpd %xmm11,%xmm2 408aff: 66 44 0f 28 5a 30 movapd 0x30(%rdx),%xmm11 408b05: 66 41 0f 58 c0 addpd %xmm8,%xmm0 408b0a: 66 45 0f 28 c2 movapd %xmm10,%xmm8 408b0f: 66 45 0f 59 c1 mulpd %xmm9,%xmm8 408b14: 66 41 0f 58 e8 addpd %xmm8,%xmm5 408b19: 66 44 0f 28 05 8e ec movapd 0xec8e(%rip),%xmm8 # 4177b0 408b20: 00 00 408b22: 66 45 0f 59 d0 mulpd %xmm8,%xmm10 408b27: 66 41 0f 58 ca addpd %xmm10,%xmm1 408b2c: 66 45 0f 28 d3 movapd %xmm11,%xmm10 408b31: 66 45 0f 59 d0 mulpd %xmm8,%xmm10 408b36: 66 45 0f 59 d9 mulpd %xmm9,%xmm11 408b3b: 66 41 0f 58 da addpd %xmm10,%xmm3 408b40: 66 44 0f 28 51 30 movapd 0x30(%rcx),%xmm10 408b46: 66 41 0f 58 fb addpd %xmm11,%xmm7 408b4b: 66 45 0f 28 58 30 movapd 0x30(%r8),%xmm11 408b51: 66 45 0f 28 e2 movapd %xmm10,%xmm12 408b56: 66 45 0f 59 e1 mulpd %xmm9,%xmm12 408b5b: 66 45 0f 59 cb mulpd %xmm11,%xmm9 408b60: 66 45 0f 59 d8 mulpd %xmm8,%xmm11 408b65: 66 45 0f 59 c2 mulpd %xmm10,%xmm8 408b6a: 66 44 0f 28 56 40 movapd 0x40(%rsi),%xmm10 408b70: 66 41 0f 58 e4 addpd %xmm12,%xmm4 408b75: 66 41 0f 58 f1 addpd %xmm9,%xmm6 408b7a: 66 44 0f 28 0d cd ec movapd 0xeccd(%rip),%xmm9 # 417850 408b81: 00 00 408b83: 66 41 0f 58 d3 addpd %xmm11,%xmm2 408b88: 66 44 0f 28 5a 40 movapd 0x40(%rdx),%xmm11 408b8e: 66 41 0f 58 c0 addpd %xmm8,%xmm0 408b93: 66 45 0f 28 c2 movapd %xmm10,%xmm8 408b98: 66 45 0f 59 c1 mulpd %xmm9,%xmm8 408b9d: 66 41 0f 58 e8 addpd %xmm8,%xmm5 408ba2: 66 44 0f 28 05 15 ec movapd 0xec15(%rip),%xmm8 # 4177c0 408ba9: 00 00 408bab: 66 45 0f 59 d0 mulpd %xmm8,%xmm10 408bb0: 66 41 0f 58 ca addpd %xmm10,%xmm1 408bb5: 66 45 0f 28 d3 movapd %xmm11,%xmm10 408bba: 66 45 0f 59 d0 mulpd %xmm8,%xmm10 408bbf: 66 45 0f 59 d9 mulpd %xmm9,%xmm11 408bc4: 66 41 0f 58 da addpd %xmm10,%xmm3 408bc9: 66 44 0f 28 51 40 movapd 0x40(%rcx),%xmm10 408bcf: 66 41 0f 58 fb addpd %xmm11,%xmm7 408bd4: 66 45 0f 28 58 40 movapd 0x40(%r8),%xmm11 408bda: 66 45 0f 28 e2 movapd %xmm10,%xmm12 408bdf: 66 45 0f 59 e1 mulpd %xmm9,%xmm12 408be4: 66 45 0f 59 cb mulpd %xmm11,%xmm9 408be9: 66 45 0f 59 d8 mulpd %xmm8,%xmm11 408bee: 66 45 0f 59 c2 mulpd %xmm10,%xmm8 408bf3: 66 44 0f 28 56 50 movapd 0x50(%rsi),%xmm10 408bf9: 66 41 0f 58 e4 addpd %xmm12,%xmm4 408bfe: 66 41 0f 58 f1 addpd %xmm9,%xmm6 408c03: 66 44 0f 28 0d 54 ec movapd 0xec54(%rip),%xmm9 # 417860 408c0a: 00 00 408c0c: 66 41 0f 58 d3 addpd %xmm11,%xmm2 408c11: 66 44 0f 28 5a 50 movapd 0x50(%rdx),%xmm11 408c17: 66 41 0f 58 c0 addpd %xmm8,%xmm0 408c1c: 66 45 0f 28 c2 movapd %xmm10,%xmm8 408c21: 66 45 0f 59 c1 mulpd %xmm9,%xmm8 408c26: 66 41 0f 58 e8 addpd %xmm8,%xmm5 408c2b: 66 44 0f 28 05 9c eb movapd 0xeb9c(%rip),%xmm8 # 4177d0 408c32: 00 00 408c34: 66 45 0f 59 d0 mulpd %xmm8,%xmm10 408c39: 66 41 0f 58 ca addpd %xmm10,%xmm1 408c3e: 66 45 0f 28 d3 movapd %xmm11,%xmm10 408c43: 66 45 0f 59 d0 mulpd %xmm8,%xmm10 408c48: 66 45 0f 59 d9 mulpd %xmm9,%xmm11 408c4d: 66 41 0f 58 da addpd %xmm10,%xmm3 408c52: 66 44 0f 28 51 50 movapd 0x50(%rcx),%xmm10 408c58: 66 41 0f 58 fb addpd %xmm11,%xmm7 408c5d: 66 45 0f 28 58 50 movapd 0x50(%r8),%xmm11 408c63: 66 45 0f 28 e2 movapd %xmm10,%xmm12 408c68: 66 45 0f 59 e1 mulpd %xmm9,%xmm12 408c6d: 66 45 0f 59 cb mulpd %xmm11,%xmm9 408c72: 66 45 0f 59 d8 mulpd %xmm8,%xmm11 408c77: 66 45 0f 59 c2 mulpd %xmm10,%xmm8 408c7c: 66 44 0f 28 56 60 movapd 0x60(%rsi),%xmm10 408c82: 66 41 0f 58 e4 addpd %xmm12,%xmm4 408c87: 66 41 0f 58 f1 addpd %xmm9,%xmm6 408c8c: 66 44 0f 28 0d db eb movapd 0xebdb(%rip),%xmm9 # 417870 408c93: 00 00 408c95: 66 41 0f 58 d3 addpd %xmm11,%xmm2 408c9a: 66 44 0f 28 5a 60 movapd 0x60(%rdx),%xmm11 408ca0: 66 41 0f 58 c0 addpd %xmm8,%xmm0 408ca5: 66 45 0f 28 c2 movapd %xmm10,%xmm8 408caa: 66 45 0f 59 c1 mulpd %xmm9,%xmm8 408caf: 66 41 0f 58 e8 addpd %xmm8,%xmm5 408cb4: 66 44 0f 28 05 23 eb movapd 0xeb23(%rip),%xmm8 # 4177e0 408cbb: 00 00 408cbd: 66 45 0f 59 d0 mulpd %xmm8,%xmm10 408cc2: 66 41 0f 58 ca addpd %xmm10,%xmm1 408cc7: 66 45 0f 28 d3 movapd %xmm11,%xmm10 408ccc: 66 45 0f 59 d0 mulpd %xmm8,%xmm10 408cd1: 66 45 0f 59 d9 mulpd %xmm9,%xmm11 408cd6: 66 41 0f 58 da addpd %xmm10,%xmm3 408cdb: 66 44 0f 28 51 60 movapd 0x60(%rcx),%xmm10 408ce1: 66 41 0f 58 fb addpd %xmm11,%xmm7 408ce6: 66 45 0f 28 58 60 movapd 0x60(%r8),%xmm11 408cec: 66 45 0f 28 e2 movapd %xmm10,%xmm12 408cf1: 66 45 0f 59 e1 mulpd %xmm9,%xmm12 408cf6: 66 45 0f 59 cb mulpd %xmm11,%xmm9 408cfb: 66 45 0f 59 d8 mulpd %xmm8,%xmm11 408d00: 66 45 0f 59 c2 mulpd %xmm10,%xmm8 408d05: 66 44 0f 28 56 70 movapd 0x70(%rsi),%xmm10 408d0b: 66 41 0f 58 e4 addpd %xmm12,%xmm4 408d10: 66 41 0f 58 f1 addpd %xmm9,%xmm6 408d15: 66 44 0f 28 0d 62 eb movapd 0xeb62(%rip),%xmm9 # 417880 408d1c: 00 00 408d1e: 66 41 0f 58 d3 addpd %xmm11,%xmm2 408d23: 66 44 0f 28 5a 70 movapd 0x70(%rdx),%xmm11 408d29: 66 41 0f 58 c0 addpd %xmm8,%xmm0 408d2e: 66 45 0f 28 c2 movapd %xmm10,%xmm8 408d33: 66 45 0f 59 c1 mulpd %xmm9,%xmm8 408d38: 66 41 0f 58 e8 addpd %xmm8,%xmm5 408d3d: 66 44 0f 28 05 aa ea movapd 0xeaaa(%rip),%xmm8 # 4177f0 408d44: 00 00 408d46: 66 45 0f 59 d0 mulpd %xmm8,%xmm10 408d4b: 66 41 0f 58 ca addpd %xmm10,%xmm1 408d50: 66 45 0f 28 d3 movapd %xmm11,%xmm10 408d55: 66 45 0f 59 d0 mulpd %xmm8,%xmm10 408d5a: 66 45 0f 59 d9 mulpd %xmm9,%xmm11 408d5f: 66 41 0f 58 da addpd %xmm10,%xmm3 408d64: 66 44 0f 28 51 70 movapd 0x70(%rcx),%xmm10 408d6a: 66 41 0f 58 fb addpd %xmm11,%xmm7 408d6f: 66 45 0f 28 58 70 movapd 0x70(%r8),%xmm11 408d75: 66 45 0f 28 e2 movapd %xmm10,%xmm12 408d7a: 66 45 0f 59 e1 mulpd %xmm9,%xmm12 408d7f: 66 0f 7c cb haddpd %xmm3,%xmm1 408d83: 66 45 0f 59 cb mulpd %xmm11,%xmm9 408d88: 66 0f 7c ef haddpd %xmm7,%xmm5 408d8c: 66 45 0f 59 d8 mulpd %xmm8,%xmm11 408d91: 66 45 0f 59 c2 mulpd %xmm10,%xmm8 408d96: 66 41 0f 29 29 movapd %xmm5,(%r9) 408d9b: 66 41 0f 58 e4 addpd %xmm12,%xmm4 408da0: 66 41 0f 58 f1 addpd %xmm9,%xmm6 408da5: 66 41 0f 58 d3 addpd %xmm11,%xmm2 408daa: 66 41 0f 58 c0 addpd %xmm8,%xmm0 408daf: 66 0f 7c e6 haddpd %xmm6,%xmm4 408db3: 66 0f 7c c2 haddpd %xmm2,%xmm0 408db7: 66 41 0f 29 61 10 movapd %xmm4,0x10(%r9) 408dbd: 66 41 0f 29 0c f9 movapd %xmm1,(%r9,%rdi,8) 408dc3: 66 41 0f 29 44 f9 10 movapd %xmm0,0x10(%r9,%rdi,8) 408dca: c3 retq 408dcb: 0f 1f 44 00 00 nopl 0x0(%rax,%rax,1)
Here is the OpenCL equivalent code:
#ifdef cl_khr_fp64
#pragma OPENCL EXTENSION cl_khr_fp64 : enable
#endif
const double2 FA[8] = {
(double2)(8.4334247333529341094733325815816e-7,
-0.1290557201342060969516786758559028e-4),
(double2)(0.8762984476210559564689161894116397e-4,
-0.30158038132690463167163703826169879e-3),
(double2)(0.174723713672993903449447812749852942e-2,
-0.942047030201080385922711540948195075e-2),
(double2)(0.2373821463724942397566389712597274535e-1,
0.612625895831207982195380597e-1),
(double2)(0.9940415697834003993178616713,
-0.604895289196983516002834636e-1),
(double2)(-0.2103025160930381434955489412839065067e-1,
0.1337263414854794752733423467013220997e-1),
(double2)(-0.344128144493493857280881509686821861e-2,
0.49443227688689919192282259476750972e-3),
(double2)(-0.5185986881173432922848639136911487e-4,
2.72734492911979659657715313017228e-6)};
const double2 FU[8] = {
(double2)(2.72734492911979659657715313017228e-6,
8.4334247333529341094733325815816e-7),
(double2)(-0.1290557201342060969516786758559028e-4,
0.8762984476210559564689161894116397e-4),
(double2)(-0.30158038132690463167163703826169879e-3,
0.174723713672993903449447812749852942e-2),
(double2)(-0.942047030201080385922711540948195075e-2,
0.2373821463724942397566389712597274535e-1),
(double2)(0.612625895831207982195380597e-1,
0.9940415697834003993178616713),
(double2)(-0.604895289196983516002834636e-1,
-0.2103025160930381434955489412839065067e-1),
(double2)(0.1337263414854794752733423467013220997e-1,
-0.344128144493493857280881509686821861e-2),
(double2)(0.49443227688689919192282259476750972e-3,
-0.5185986881173432922848639136911487e-4)};
#define conv_4x2_block_fused(o_f,o_s,d00,d10,d20,d30)
d00 = in0[o_s];
A = FA[o_f];
S00 += d00 * A;
U = FU[o_f];
d10 = in1[o_s];
S01 += d00 * U;
S11 += d10 * U;
d20 = in2[o_s];
S10 += d10 * A;
d30 = in3[o_s];
S20 += d20 * A;
S30 += d30 * A;
S31 += d30 * U;
S21 += d20 * U;
__kernel void magic_filter5(const ulong n, const ulong ndat, __global const double2 *in, __global double2 *out) {
const unsigned int n2 = n/2;
const unsigned int ndat2 = ndat/2;
__global const double2 *in0 = in;
__global const double2 *in1 = in0 + n2;
__global const double2 *in2 = in1 + n2;
__global const double2 *in3 = in2 + n2;
__global double2 *out = out + 4;
double2 D00,D10,D20,D30;
double2 S00,S01,S10,S11,S20,S21,S30,S31;
double2 A, U;
A = FA[0];
D00 = in0[0];
S00 = D00 * A;
D10 = in1[0];
S10 = D10 * A;
D20 = in2[0];
S20 = D20 * A;
D30 = in3[0];
S30 = D30 * A;
U = FU[0];
D00.x = in0[8].x;
S01 = D00 * U;
D10.x = in1[8].x;
S11 = D10 * U;
D20.x = in2[8].x;
S21 = D20 * U;
D30.x = in3[8].x;
S31 = D30 * U;
conv_4x2_block_fused(1,1,D00,D10,D20,D30);
conv_4x2_block_fused(2,2,D00,D10,D20,D30);
conv_4x2_block_fused(3,3,D00,D10,D20,D30);
conv_4x2_block_fused(4,4,D00,D10,D20,D30);
conv_4x2_block_fused(5,5,D00,D10,D20,D30);
conv_4x2_block_fused(6,6,D00,D10,D20,D30);
conv_4x2_block_fused(7,7,D00,D10,D20,D30);
out[0] = (double2)(S00.x + S00.y, S10.x + S10.y);
out[1] = (double2)(S20.x + S20.y, S30.x + S30.y);
out[ndat2 + 0] = (double2)(S01.x + S01.y, S11.x + S11.y);
out[ndat2 + 1] = (double2)(S21.x + S21.y, S31.x + S31.y);
} And here is the generated assembly:
.section .rodata.cst16,"aM",@progbits,16
.align 16
.LCPI2_0: # constant pool <2 x double>
.quad 4546094365667641806 # double 8.762984e-05
.quad -4669173237098585703 # double -3.015804e-04
.LCPI2_1: # constant pool <2 x double>
.quad 4516068371457184450 # double 8.433425e-07
.quad -4689636306257622577 # double -1.290557e-05
.LCPI2_2: # constant pool <2 x double>
.quad 4565700531973276514 # double 1.747237e-03
.quad -4646787883676373161 # double -9.420470e-03
.LCPI2_3: # constant pool <2 x double>
.quad 4582499295942488407 # double 2.373821e-02
.quad 4588989690655579855 # double 6.126259e-02
.LCPI2_4: # constant pool <2 x double>
.quad 4607128750031811026 # double 9.940416e-01
.quad -4634493755982099125 # double -6.048953e-02
.LCPI2_5: # constant pool <2 x double>
.quad -4641653258114437092 # double -2.103025e-02
.quad 4578862420522603841 # double 1.337263e-02
.LCPI2_6: # constant pool <2 x double>
.quad -4653290559565923332 # double -3.441281e-03
.quad 4557699556108328520 # double 4.944323e-04
.LCPI2_7: # constant pool <2 x double>
.quad -4680594046271356616 # double -5.185987e-05
.quad 4523550187392857396 # double 2.727345e-06
.LCPI2_8: # constant pool <2 x double>
.quad -4689636306257622577 # double -1.290557e-05
.quad 4546094365667641806 # double 8.762984e-05
.LCPI2_9: # constant pool <2 x double>
.quad 4523550187392857396 # double 2.727345e-06
.quad 4516068371457184450 # double 8.433425e-07
.LCPI2_10: # constant pool <2 x double>
.quad -4669173237098585703 # double -3.015804e-04
.quad 4565700531973276514 # double 1.747237e-03
.LCPI2_11: # constant pool <2 x double>
.quad -4646787883676373161 # double -9.420470e-03
.quad 4582499295942488407 # double 2.373821e-02
.LCPI2_12: # constant pool <2 x double>
.quad 4588989690655579855 # double 6.126259e-02
.quad 4607128750031811026 # double 9.940416e-01
.LCPI2_13: # constant pool <2 x double>
.quad -4634493755982099125 # double -6.048953e-02
.quad -4641653258114437092 # double -2.103025e-02
.LCPI2_14: # constant pool <2 x double>
.quad 4578862420522603841 # double 1.337263e-02
.quad -4653290559565923332 # double -3.441281e-03
.LCPI2_15: # constant pool <2 x double>
.quad 4557699556108328520 # double 4.944323e-04
.quad -4680594046271356616 # double -5.185987e-05
.text
.globl magic_filter5
.align 16, 0x90
.type magic_filter5,@function
magic_filter5: # @magic_filter5
# BB#0: # %FirstBB
push RBP
push R15
push R14
push R13
push R12
push RBX
sub RSP, 344
mov RAX, QWORD PTR [RSP + 400]
shr RAX
mov EAX, EAX
lea RCX, QWORD PTR [RAX + 2*RAX]
mov RDX, RAX
shl RDX, 5
mov RSI, QWORD PTR [RSP + 416]
lea RDI, QWORD PTR [RDX + RSI + 128]
mov QWORD PTR [RSP + 8], RDI # 8-byte Spill
lea RDI, QWORD PTR [RSI + RDX + 16]
mov QWORD PTR [RSP], RDI # 8-byte Spill
lea RDI, QWORD PTR [RDX + RSI + 32]
mov QWORD PTR [RSP - 8], RDI # 8-byte Spill
lea RDI, QWORD PTR [RDX + RSI + 48]
mov QWORD PTR [RSP - 16], RDI # 8-byte Spill
lea RDI, QWORD PTR [RDX + RSI + 64]
mov QWORD PTR [RSP - 24], RDI # 8-byte Spill
lea RDI, QWORD PTR [RDX + RSI + 80]
mov QWORD PTR [RSP - 32], RDI # 8-byte Spill
lea RDI, QWORD PTR [RDX + RSI + 96]
mov QWORD PTR [RSP - 40], RDI # 8-byte Spill
lea RDI, QWORD PTR [RDX + RSI + 112]
mov QWORD PTR [RSP - 48], RDI # 8-byte Spill
add RDX, RSI
shl RAX, 4
lea RDI, QWORD PTR [RSI + RAX + 128]
mov QWORD PTR [RSP - 56], RDI # 8-byte Spill
lea RDI, QWORD PTR [RSI + RAX + 16]
mov QWORD PTR [RSP - 64], RDI # 8-byte Spill
lea RDI, QWORD PTR [RSI + RAX + 32]
mov QWORD PTR [RSP - 72], RDI # 8-byte Spill
lea RDI, QWORD PTR [RSI + RAX + 48]
mov QWORD PTR [RSP - 80], RDI # 8-byte Spill
lea RDI, QWORD PTR [RSI + RAX + 64]
mov QWORD PTR [RSP - 88], RDI # 8-byte Spill
lea RDI, QWORD PTR [RSI + RAX + 80]
mov QWORD PTR [RSP - 96], RDI # 8-byte Spill
lea RDI, QWORD PTR [RSI + RAX + 96]
mov QWORD PTR [RSP - 104], RDI # 8-byte Spill
lea RDI, QWORD PTR [RSI + RAX + 112]
mov QWORD PTR [RSP - 112], RDI # 8-byte Spill
shl RCX, 4
lea RDI, QWORD PTR [RSI + RCX + 128]
mov QWORD PTR [RSP - 120], RDI # 8-byte Spill
lea RDI, QWORD PTR [RSI + RCX + 16]
mov QWORD PTR [RSP - 128], RDI # 8-byte Spill
lea RDI, QWORD PTR [RSI + RCX + 32]
lea R8, QWORD PTR [RSI + RCX + 48]
lea R9, QWORD PTR [RSI + RCX + 64]
lea R10, QWORD PTR [RSI + RCX + 80]
lea R11, QWORD PTR [RSI + RCX + 96]
lea RBX, QWORD PTR [RSI + RCX + 112]
mov R14, QWORD PTR [RSP + 408]
movabs R15, 8589934590
and R15, R14
mov R12, QWORD PTR [RSP + 424]
lea R15, QWORD PTR [R12 + 8*R15 + 64]
shr R14
inc R14D
shl R14, 4
lea R14, QWORD PTR [R14 + R12 + 64]
add RCX, RSI
add RAX, RSI
mov R13, -1
.align 16, 0x90
.LBB2_1: # %SyncBB
# =>This Inner Loop Header: Depth=1
mov RBP, QWORD PTR [RSP - 64] # 8-byte Reload
movapd XMM0, XMMWORD PTR [RBP]
movapd XMMWORD PTR [RSP + 304], XMM0 # 16-byte Spill
movapd XMM1, XMM0
mulpd XMM1, XMMWORD PTR [RIP + .LCPI2_0]
movapd XMM2, XMMWORD PTR [RAX]
movapd XMM3, XMM2
mulpd XMM3, XMMWORD PTR [RIP + .LCPI2_1]
addpd XMM3, XMM1
mov RBP, QWORD PTR [RSP - 72] # 8-byte Reload
movapd XMM0, XMMWORD PTR [RBP]
movapd XMMWORD PTR [RSP + 64], XMM0 # 16-byte Spill
movapd XMM1, XMM0
mulpd XMM1, XMMWORD PTR [RIP + .LCPI2_2]
addpd XMM1, XMM3
mov RBP, QWORD PTR [RSP - 80] # 8-byte Reload
movapd XMM0, XMMWORD PTR [RBP]
movapd XMMWORD PTR [RSP + 288], XMM0 # 16-byte Spill
movapd XMM3, XMM0
mulpd XMM3, XMMWORD PTR [RIP + .LCPI2_3]
addpd XMM3, XMM1
mov RBP, QWORD PTR [RSP - 88] # 8-byte Reload
movapd XMM0, XMMWORD PTR [RBP]
movapd XMMWORD PTR [RSP + 256], XMM0 # 16-byte Spill
movapd XMM1, XMM0
mulpd XMM1, XMMWORD PTR [RIP + .LCPI2_4]
addpd XMM1, XMM3
mov RBP, QWORD PTR [RSP - 96] # 8-byte Reload
movapd XMM0, XMMWORD PTR [RBP]
movapd XMMWORD PTR [RSP + 144], XMM0 # 16-byte Spill
movapd XMM3, XMM0
mulpd XMM3, XMMWORD PTR [RIP + .LCPI2_5]
addpd XMM3, XMM1
mov RBP, QWORD PTR [RSP - 104] # 8-byte Reload
movapd XMM0, XMMWORD PTR [RBP]
movapd XMMWORD PTR [RSP + 128], XMM0 # 16-byte Spill
movapd XMM1, XMM0
mulpd XMM1, XMMWORD PTR [RIP + .LCPI2_6]
addpd XMM1, XMM3
mov RBP, QWORD PTR [RSP - 112] # 8-byte Reload
movapd XMM3, XMMWORD PTR [RBP]
movapd XMM4, XMM3
mulpd XMM4, XMMWORD PTR [RIP + .LCPI2_7]
addpd XMM4, XMM1
movapd XMM1, XMM4
unpckhpd XMM1, XMM1 # xmm1 = xmm1[1,1]
addsd XMM1, XMM4
movapd XMM4, XMMWORD PTR [RSI]
movapd XMM5, XMMWORD PTR [RSI + 16]
movapd XMMWORD PTR [RSP + 320], XMM5 # 16-byte Spill
movapd XMM0, XMMWORD PTR [RSI + 32]
movapd XMMWORD PTR [RSP + 112], XMM0 # 16-byte Spill
movapd XMM5, XMMWORD PTR [RSI + 48]
movapd XMMWORD PTR [RSP + 272], XMM5 # 16-byte Spill
movapd XMM6, XMMWORD PTR [RSP + 320] # 16-byte Reload
mulpd XMM6, XMMWORD PTR [RIP + .LCPI2_0]
movapd XMM7, XMM4
mulpd XMM7, XMMWORD PTR [RIP + .LCPI2_1]
addpd XMM7, XMM6
movapd XMM6, XMM0
mulpd XMM6, XMMWORD PTR [RIP + .LCPI2_2]
addpd XMM6, XMM7
movapd XMM7, XMM5
mulpd XMM7, XMMWORD PTR [RIP + .LCPI2_3]
addpd XMM7, XMM6
movapd XMM0, XMMWORD PTR [RSI + 64]
movapd XMMWORD PTR [RSP + 96], XMM0 # 16-byte Spill
movapd XMM5, XMM0
mulpd XMM5, XMMWORD PTR [RIP + .LCPI2_4]
addpd XMM5, XMM7
movapd XMM0, XMMWORD PTR [RSI + 80]
movapd XMMWORD PTR [RSP + 80], XMM0 # 16-byte Spill
movapd XMM6, XMM0
mulpd XMM6, XMMWORD PTR [RIP + .LCPI2_5]
addpd XMM6, XMM5
movapd XMM0, XMMWORD PTR [RSI + 96]
movapd XMMWORD PTR [RSP + 16], XMM0 # 16-byte Spill
movapd XMM5, XMM0
mulpd XMM5, XMMWORD PTR [RIP + .LCPI2_6]
addpd XMM5, XMM6
movapd XMM6, XMMWORD PTR [RSI + 112]
movapd XMM7, XMM6
mulpd XMM7, XMMWORD PTR [RIP + .LCPI2_7]
addpd XMM7, XMM5
movapd XMM5, XMM7
unpckhpd XMM5, XMM5 # xmm5 = xmm5[1,1]
addsd XMM5, XMM7
unpcklpd XMM5, XMM1 # xmm5 = xmm5[0],xmm1[0]
movapd XMM1, XMMWORD PTR [RDX]
movapd XMM7, XMM1
mulpd XMM7, XMMWORD PTR [RIP + .LCPI2_1]
mov RBP, QWORD PTR [RSP + 8] # 8-byte Reload
movlpd XMM1, QWORD PTR [RBP]
movapd XMM8, XMMWORD PTR [RCX]
movapd XMM9, XMM8
mulpd XMM9, XMMWORD PTR [RIP + .LCPI2_1]
mov RBP, QWORD PTR [RSP - 120] # 8-byte Reload
movlpd XMM8, QWORD PTR [RBP]
movapd XMM10, XMMWORD PTR [RSP + 320] # 16-byte Reload
mulpd XMM10, XMMWORD PTR [RIP + .LCPI2_8]
movapd XMMWORD PTR [RSP + 320], XMM10 # 16-byte Spill
movlpd XMM4, QWORD PTR [RSI + 128]
mov RBP, QWORD PTR [RSP - 56] # 8-byte Reload
movlpd XMM2, QWORD PTR [RBP]
mov RBP, QWORD PTR [RSP - 48] # 8-byte Reload
movapd XMM10, XMMWORD PTR [RBP]
movapd XMMWORD PTR [RSP + 48], XMM10 # 16-byte Spill
mov RBP, QWORD PTR [RSP - 40] # 8-byte Reload
movapd XMM11, XMMWORD PTR [RBP]
movapd XMMWORD PTR [RSP + 240], XMM11 # 16-byte Spill
mov RBP, QWORD PTR [RSP - 32] # 8-byte Reload
movapd XMM12, XMMWORD PTR [RBP]
movapd XMMWORD PTR [RSP + 224], XMM12 # 16-byte Spill
mov RBP, QWORD PTR [RSP - 24] # 8-byte Reload
movapd XMM13, XMMWORD PTR [RBP]
movapd XMMWORD PTR [RSP + 208], XMM13 # 16-byte Spill
mov RBP, QWORD PTR [RSP - 16] # 8-byte Reload
movapd XMM14, XMMWORD PTR [RBP]
movapd XMMWORD PTR [RSP + 192], XMM14 # 16-byte Spill
mov RBP, QWORD PTR [RSP - 8] # 8-byte Reload
movapd XMM15, XMMWORD PTR [RBP]
movapd XMMWORD PTR [RSP + 160], XMM15 # 16-byte Spill
mov RBP, QWORD PTR [RSP] # 8-byte Reload
movapd XMM0, XMMWORD PTR [RBP]
movapd XMMWORD PTR [RSP + 176], XMM0 # 16-byte Spill
movapd XMM0, XMMWORD PTR [RBX]
movapd XMM10, XMMWORD PTR [R11]
movapd XMM11, XMMWORD PTR [R10]
movapd XMM12, XMMWORD PTR [R9]
movapd XMM13, XMMWORD PTR [R8]
movapd XMM14, XMMWORD PTR [RDI]
mov RBP, QWORD PTR [RSP - 128] # 8-byte Reload
movapd XMM15, XMMWORD PTR [RBP]
movapd XMMWORD PTR [RSP + 32], XMM15 # 16-byte Spill
movapd XMMWORD PTR [R12 + 64], XMM5
movapd XMM5, XMM15
mulpd XMM5, XMMWORD PTR [RIP + .LCPI2_0]
addpd XMM5, XMM9
movapd XMM9, XMM14
mulpd XMM9, XMMWORD PTR [RIP + .LCPI2_2]
addpd XMM9, XMM5
movapd XMM5, XMM13
mulpd XMM5, XMMWORD PTR [RIP + .LCPI2_3]
addpd XMM5, XMM9
movapd XMM9, XMM12
mulpd XMM9, XMMWORD PTR [RIP + .LCPI2_4]
addpd XMM9, XMM5
movapd XMM5, XMM11
mulpd XMM5, XMMWORD PTR [RIP + .LCPI2_5]
addpd XMM5, XMM9
movapd XMM9, XMM10
mulpd XMM9, XMMWORD PTR [RIP + .LCPI2_6]
addpd XMM9, XMM5
movapd XMM5, XMM0
mulpd XMM5, XMMWORD PTR [RIP + .LCPI2_7]
addpd XMM5, XMM9
movapd XMM9, XMM5
unpckhpd XMM9, XMM9 # xmm9 = xmm9[1,1]
addsd XMM9, XMM5
movapd XMM5, XMMWORD PTR [RSP + 176] # 16-byte Reload
mulpd XMM5, XMMWORD PTR [RIP + .LCPI2_0]
addpd XMM5, XMM7
movapd XMM7, XMMWORD PTR [RSP + 160] # 16-byte Reload
mulpd XMM7, XMMWORD PTR [RIP + .LCPI2_2]
addpd XMM7, XMM5
movapd XMM5, XMMWORD PTR [RSP + 192] # 16-byte Reload
mulpd XMM5, XMMWORD PTR [RIP + .LCPI2_3]
addpd XMM5, XMM7
movapd XMM7, XMMWORD PTR [RSP + 208] # 16-byte Reload
mulpd XMM7, XMMWORD PTR [RIP + .LCPI2_4]
addpd XMM7, XMM5
movapd XMM5, XMMWORD PTR [RSP + 224] # 16-byte Reload
mulpd XMM5, XMMWORD PTR [RIP + .LCPI2_5]
addpd XMM5, XMM7
movapd XMM7, XMMWORD PTR [RSP + 240] # 16-byte Reload
mulpd XMM7, XMMWORD PTR [RIP + .LCPI2_6]
addpd XMM7, XMM5
movapd XMM5, XMMWORD PTR [RSP + 48] # 16-byte Reload
mulpd XMM5, XMMWORD PTR [RIP + .LCPI2_7]
addpd XMM5, XMM7
movapd XMM7, XMM5
unpckhpd XMM7, XMM7 # xmm7 = xmm7[1,1]
addsd XMM7, XMM5
unpcklpd XMM7, XMM9 # xmm7 = xmm7[0],xmm9[0]
movapd XMMWORD PTR [R12 + 80], XMM7
movapd XMM5, XMMWORD PTR [RSP + 304] # 16-byte Reload
mulpd XMM5, XMMWORD PTR [RIP + .LCPI2_8]
movapd XMMWORD PTR [RSP + 304], XMM5 # 16-byte Spill
movapd XMM5, XMMWORD PTR [RIP + .LCPI2_9]
mulpd XMM2, XMM5
addpd XMM2, XMMWORD PTR [RSP + 304] # 16-byte Folded Reload
movapd XMM7, XMMWORD PTR [RIP + .LCPI2_10]
movapd XMM9, XMMWORD PTR [RSP + 64] # 16-byte Reload
mulpd XMM9, XMM7
addpd XMM9, XMM2
movapd XMMWORD PTR [RSP + 64], XMM9 # 16-byte Spill
movapd XMM2, XMMWORD PTR [RSP + 288] # 16-byte Reload
mulpd XMM2, XMMWORD PTR [RIP + .LCPI2_11]
addpd XMM2, XMM9
movapd XMMWORD PTR [RSP + 288], XMM2 # 16-byte Spill
movapd XMM2, XMMWORD PTR [RSP + 256] # 16-byte Reload
mulpd XMM2, XMMWORD PTR [RIP + .LCPI2_12]
addpd XMM2, XMMWORD PTR [RSP + 288] # 16-byte Folded Reload
movapd XMMWORD PTR [RSP + 256], XMM2 # 16-byte Spill
movapd XMM2, XMMWORD PTR [RSP + 144] # 16-byte Reload
mulpd XMM2, XMMWORD PTR [RIP + .LCPI2_13]
addpd XMM2, XMMWORD PTR [RSP + 256] # 16-byte Folded Reload
movapd XMMWORD PTR [RSP + 144], XMM2 # 16-byte Spill
movapd XMM2, XMMWORD PTR [RIP + .LCPI2_14]
movapd XMM9, XMMWORD PTR [RSP + 128] # 16-byte Reload
mulpd XMM9, XMM2
addpd XMM9, XMMWORD PTR [RSP + 144] # 16-byte Folded Reload
movapd XMMWORD PTR [RSP + 128], XMM9 # 16-byte Spill
movapd XMM9, XMMWORD PTR [RIP + .LCPI2_15]
mulpd XMM3, XMM9
addpd XMM3, XMMWORD PTR [RSP + 128] # 16-byte Folded Reload
movapd XMM15, XMM3
unpckhpd XMM15, XMM15 # xmm15 = xmm15[1,1]
addsd XMM15, XMM3
mulpd XMM4, XMM5
addpd XMM4, XMMWORD PTR [RSP + 320] # 16-byte Folded Reload
movapd XMM3, XMMWORD PTR [RSP + 112] # 16-byte Reload
mulpd XMM3, XMM7
addpd XMM3, XMM4
movapd XMMWORD PTR [RSP + 112], XMM3 # 16-byte Spill
movapd XMM3, XMMWORD PTR [RSP + 272] # 16-byte Reload
mulpd XMM3, XMMWORD PTR [RIP + .LCPI2_11]
addpd XMM3, XMMWORD PTR [RSP + 112] # 16-byte Folded Reload
movapd XMMWORD PTR [RSP + 272], XMM3 # 16-byte Spill
movapd XMM3, XMMWORD PTR [RSP + 96] # 16-byte Reload
mulpd XMM3, XMMWORD PTR [RIP + .LCPI2_12]
addpd XMM3, XMMWORD PTR [RSP + 272] # 16-byte Folded Reload
movapd XMMWORD PTR [RSP + 96], XMM3 # 16-byte Spill
movapd XMM3, XMMWORD PTR [RSP + 80] # 16-byte Reload
mulpd XMM3, XMMWORD PTR [RIP + .LCPI2_13]
addpd XMM3, XMMWORD PTR [RSP + 96] # 16-byte Folded Reload
movapd XMMWORD PTR [RSP + 80], XMM3 # 16-byte Spill
movapd XMM3, XMMWORD PTR [RSP + 16] # 16-byte Reload
mulpd XMM3, XMM2
addpd XMM3, XMMWORD PTR [RSP + 80] # 16-byte Folded Reload
movapd XMMWORD PTR [RSP + 16], XMM3 # 16-byte Spill
mulpd XMM6, XMM9
addpd XMM6, XMM3
movapd XMM3, XMM6
unpckhpd XMM3, XMM3 # xmm3 = xmm3[1,1]
addsd XMM3, XMM6
unpcklpd XMM3, XMM15 # xmm3 = xmm3[0],xmm15[0]
movapd XMMWORD PTR [R15], XMM3
movapd XMM15, XMMWORD PTR [RSP + 32] # 16-byte Reload
mulpd XMM15, XMMWORD PTR [RIP + .LCPI2_8]
movapd XMMWORD PTR [RSP + 32], XMM15 # 16-byte Spill
mulpd XMM8, XMM5
addpd XMM8, XMM15
mulpd XMM14, XMM7
addpd XMM14, XMM8
mulpd XMM13, XMMWORD PTR [RIP + .LCPI2_11]
addpd XMM13, XMM14
mulpd XMM12, XMMWORD PTR [RIP + .LCPI2_12]
addpd XMM12, XMM13
mulpd XMM11, XMMWORD PTR [RIP + .LCPI2_13]
addpd XMM11, XMM12
mulpd XMM10, XMM2
addpd XMM10, XMM11
mulpd XMM0, XMM9
addpd XMM0, XMM10
movapd XMM3, XMM0
unpckhpd XMM3, XMM3 # xmm3 = xmm3[1,1]
addsd XMM3, XMM0
movapd XMM0, XMMWORD PTR [RSP + 176] # 16-byte Reload
mulpd XMM0, XMMWORD PTR [RIP + .LCPI2_8]
movapd XMMWORD PTR [RSP + 176], XMM0 # 16-byte Spill
mulpd XMM1, XMM5
addpd XMM1, XMM0
movapd XMM15, XMMWORD PTR [RSP + 160] # 16-byte Reload
mulpd XMM15, XMM7
addpd XMM15, XMM1
movapd XMMWORD PTR [RSP + 160], XMM15 # 16-byte Spill
movapd XMM14, XMMWORD PTR [RSP + 192] # 16-byte Reload
mulpd XMM14, XMMWORD PTR [RIP + .LCPI2_11]
addpd XMM14, XMM15
movapd XMMWORD PTR [RSP + 192], XMM14 # 16-byte Spill
movapd XMM13, XMMWORD PTR [RSP + 208] # 16-byte Reload
mulpd XMM13, XMMWORD PTR [RIP + .LCPI2_12]
addpd XMM13, XMM14
movapd XMMWORD PTR [RSP + 208], XMM13 # 16-byte Spill
movapd XMM12, XMMWORD PTR [RSP + 224] # 16-byte Reload
mulpd XMM12, XMMWORD PTR [RIP + .LCPI2_13]
addpd XMM12, XMM13
movapd XMMWORD PTR [RSP + 224], XMM12 # 16-byte Spill
movapd XMM11, XMMWORD PTR [RSP + 240] # 16-byte Reload
mulpd XMM11, XMM2
addpd XMM11, XMM12
movapd XMMWORD PTR [RSP + 240], XMM11 # 16-byte Spill
movapd XMM10, XMMWORD PTR [RSP + 48] # 16-byte Reload
mulpd XMM10, XMM9
addpd XMM10, XMM11
movapd XMM0, XMM10
unpckhpd XMM0, XMM0 # xmm0 = xmm0[1,1]
addsd XMM0, XMM10
unpcklpd XMM0, XMM3 # xmm0 = xmm0[0],xmm3[0]
movapd XMMWORD PTR [R14], XMM0
inc R13
cmp R13, QWORD PTR [RSP + 480]
jb .LBB2_1
# BB#2: # %SyncBB29
add RSP, 344
pop RBX
pop R12
pop R13
pop R14
pop R15
pop RBP
ret
.Ltmp2:
.size magic_filter5, .Ltmp2-magic_filter5
As can be seen, the OpenCL version is spilling a lot of registers. This effect halves the performances.
Am I doing something wrong? I would have expected code written in intrinsics to port quite straightforwardly to OpenCL vectorized language. Even though I was curious to see if the compiler would optimize the transposition using haddpd, I did not expect such differences.
Sorry for the long post with lots of code but I could not reduce further.


