The code is compiled with MSVC 2010 SP1 using /arch:AVX, and the AVX version is slightly (5-10%) slower than the SSE version. I am running on an E3-1230 V2 processor with 16 GB of dual-channel DDR3-1600 memory.
Both functions read 416 bytes of data from memory (nine 8-float vectors plus another four 8-float vectors) and return a single float; there are no memory stores. The compiled SSE version has 111 instructions and the AVX version has 67. All memory accesses are aligned (16-byte for SSE, 32-byte for AVX). The only difference between the two versions is that the SSE version processes 4 floats per instruction, so it needs two instructions for each length-8 vector, while the AVX version processes 8 floats per instruction.
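The relevant data layout is roughly as follows (a simplified sketch, not the full definitions; the union, the 32-byte alignment, and the 32-float Projection size are inferred from how the fields are used in the code below):
#include <immintrin.h>
// Simplified sketch of the structs used below. Each SURFPixData holds 8 floats,
// viewable as two __m128 halves (SSE path) or one __m256 (AVX path).
__declspec(align(32)) struct SURFPixData
{
    union
    {
        __m128 datam128[2];
        __m256 datam256;
    };
};
struct SURFWeakClassifier
{
    struct
    {
        SURFPixData * ptrPtOffsets[9];   // 3x3 grid of sample rows
    } Feature;
    __declspec(align(32)) float Projection[32];  // 4 projection vectors of length 8
};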
The AVX version should be at least as fast as the SSE version even if the program is memory-bound, but it turns out to be slower. This code is the core of an image-processing program: the SSE version processes an image in ~180 ms, while the AVX version takes ~200 ms. The function is called about 2M times per image, with different inputs.
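A minimal harness along these lines can reproduce the measurement outside the full pipeline (this is only a sketch; TimeEval, g_sink, and the offsets array are placeholders, not the real program, which computes the offsets and classifiers during image processing):
// Minimal timing sketch (assumed harness): call one version ~2M times with
// varying offsets and keep the result live so the compiler cannot drop the calls.
#include <windows.h>
#include <stdio.h>

volatile float g_sink;

void TimeEval(float (*eval)(SURFWeakClassifier *, int), SURFWeakClassifier * ptrWeak,
              const int * offsets, int numCalls, const char * name)
{
    LARGE_INTEGER freq, t0, t1;
    QueryPerformanceFrequency(&freq);
    QueryPerformanceCounter(&t0);
    float acc = 0.0f;
    for (int i = 0; i < numCalls; ++i)
        acc += eval(ptrWeak, offsets[i]);   // a different input each call
    QueryPerformanceCounter(&t1);
    g_sink = acc;
    printf("%s: %.1f ms for %d calls\n", name,
           1000.0 * (t1.QuadPart - t0.QuadPart) / freq.QuadPart, numCalls);
}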
The code is as follows.
SSE version:
float _SURFEvalProjection_2C2R_Fast(SURFWeakClassifier * ptrWeak, int pixOffset)
{
    // Nine sample points (a 3x3 grid), each shifted by the same pixel offset.
    SURFPixData * ptr_x0y0 = ptrWeak->Feature.ptrPtOffsets[0] + pixOffset;
    SURFPixData * ptr_x0y1 = ptrWeak->Feature.ptrPtOffsets[1] + pixOffset;
    SURFPixData * ptr_x0y2 = ptrWeak->Feature.ptrPtOffsets[2] + pixOffset;
    SURFPixData * ptr_x1y0 = ptrWeak->Feature.ptrPtOffsets[3] + pixOffset;
    SURFPixData * ptr_x1y1 = ptrWeak->Feature.ptrPtOffsets[4] + pixOffset;
    SURFPixData * ptr_x1y2 = ptrWeak->Feature.ptrPtOffsets[5] + pixOffset;
    SURFPixData * ptr_x2y0 = ptrWeak->Feature.ptrPtOffsets[6] + pixOffset;
    SURFPixData * ptr_x2y1 = ptrWeak->Feature.ptrPtOffsets[7] + pixOffset;
    SURFPixData * ptr_x2y2 = ptrWeak->Feature.ptrPtOffsets[8] + pixOffset;
    __m128 dp_4, dp_8;
    __m128 tmp40, tmp41, tmp42, tmp43, tmp80, tmp81, tmp82, tmp83;
    // Corner differences: tmp4x works on the low 4 floats of each sample,
    // tmp8x on the high 4 floats.
    tmp40 = _mm_sub_ps(_mm_add_ps(ptr_x1y1->datam128[0], ptr_x0y0->datam128[0]), _mm_add_ps(ptr_x1y0->datam128[0], ptr_x0y1->datam128[0]));
    tmp41 = _mm_sub_ps(_mm_add_ps(ptr_x2y1->datam128[0], ptr_x1y0->datam128[0]), _mm_add_ps(ptr_x2y0->datam128[0], ptr_x1y1->datam128[0]));
    tmp42 = _mm_sub_ps(_mm_add_ps(ptr_x1y2->datam128[0], ptr_x0y1->datam128[0]), _mm_add_ps(ptr_x1y1->datam128[0], ptr_x0y2->datam128[0]));
    tmp43 = _mm_sub_ps(_mm_add_ps(ptr_x2y2->datam128[0], ptr_x1y1->datam128[0]), _mm_add_ps(ptr_x2y1->datam128[0], ptr_x1y2->datam128[0]));
    tmp80 = _mm_sub_ps(_mm_add_ps(ptr_x1y1->datam128[1], ptr_x0y0->datam128[1]), _mm_add_ps(ptr_x1y0->datam128[1], ptr_x0y1->datam128[1]));
    tmp81 = _mm_sub_ps(_mm_add_ps(ptr_x2y1->datam128[1], ptr_x1y0->datam128[1]), _mm_add_ps(ptr_x2y0->datam128[1], ptr_x1y1->datam128[1]));
    tmp82 = _mm_sub_ps(_mm_add_ps(ptr_x1y2->datam128[1], ptr_x0y1->datam128[1]), _mm_add_ps(ptr_x1y1->datam128[1], ptr_x0y2->datam128[1]));
    tmp83 = _mm_sub_ps(_mm_add_ps(ptr_x2y2->datam128[1], ptr_x1y1->datam128[1]), _mm_add_ps(ptr_x2y1->datam128[1], ptr_x1y2->datam128[1]));
    // Calculate the inner product, add eps, and take the reciprocal square root.
    dp_4 = _mm_add_ps(_mm_add_ps(_mm_dp_ps(tmp40, tmp40, 255), _mm_dp_ps(tmp41, tmp41, 255)),
                      _mm_add_ps(_mm_dp_ps(tmp42, tmp42, 255), _mm_dp_ps(tmp43, tmp43, 255)));
    dp_8 = _mm_add_ps(_mm_add_ps(_mm_dp_ps(tmp80, tmp80, 255), _mm_dp_ps(tmp81, tmp81, 255)),
                      _mm_add_ps(_mm_dp_ps(tmp82, tmp82, 255), _mm_dp_ps(tmp83, tmp83, 255)));
    dp_4 = _mm_add_ps(dp_4, dp_8);
    __m128 m128_eps = _mm_set1_ps(1e-8f);
    dp_4 = _mm_add_ps(dp_4, m128_eps);
    dp_4 = _mm_rsqrt_ps(dp_4);
    // Normalize and inner prod with the projections.
    __m128 res0 = _mm_mul_ps(_mm_load_ps(ptrWeak->Projection), tmp40);
    __m128 res1 = _mm_mul_ps(_mm_load_ps(ptrWeak->Projection+8), tmp41);
    __m128 res2 = _mm_mul_ps(_mm_load_ps(ptrWeak->Projection+16), tmp42);
    __m128 res3 = _mm_mul_ps(_mm_load_ps(ptrWeak->Projection+24), tmp43);
    __m128 res4 = _mm_mul_ps(_mm_load_ps(ptrWeak->Projection+4), tmp80);
    __m128 res5 = _mm_mul_ps(_mm_load_ps(ptrWeak->Projection+12), tmp81);
    __m128 res6 = _mm_mul_ps(_mm_load_ps(ptrWeak->Projection+20), tmp82);
    __m128 res7 = _mm_mul_ps(_mm_load_ps(ptrWeak->Projection+28), tmp83);
    res0 = _mm_add_ps(_mm_add_ps(res0, res1), _mm_add_ps(res2, res3));
    res1 = _mm_add_ps(_mm_add_ps(res4, res5), _mm_add_ps(res6, res7));
    res0 = _mm_add_ps(res0, res1);
    res0 = _mm_mul_ps(res0, dp_4);
    // Horizontal sum: after the two hadds the total lands in element 3.
    __m128 m128_zero = _mm_setzero_ps();
    res0 = _mm_hadd_ps(m128_zero, res0);
    res0 = _mm_hadd_ps(m128_zero, res0);
    return res0.m128_f32[3];
}
AVX version:
float _SURFEvalProjection_2C2R_Fast_AVX(SURFWeakClassifier * ptrWeak, int pixOffset)
{
    // Nine sample points (a 3x3 grid), each shifted by the same pixel offset.
    SURFPixData * ptr_x0y0 = ptrWeak->Feature.ptrPtOffsets[0] + pixOffset;
    SURFPixData * ptr_x0y1 = ptrWeak->Feature.ptrPtOffsets[1] + pixOffset;
    SURFPixData * ptr_x0y2 = ptrWeak->Feature.ptrPtOffsets[2] + pixOffset;
    SURFPixData * ptr_x1y0 = ptrWeak->Feature.ptrPtOffsets[3] + pixOffset;
    SURFPixData * ptr_x1y1 = ptrWeak->Feature.ptrPtOffsets[4] + pixOffset;
    SURFPixData * ptr_x1y2 = ptrWeak->Feature.ptrPtOffsets[5] + pixOffset;
    SURFPixData * ptr_x2y0 = ptrWeak->Feature.ptrPtOffsets[6] + pixOffset;
    SURFPixData * ptr_x2y1 = ptrWeak->Feature.ptrPtOffsets[7] + pixOffset;
    SURFPixData * ptr_x2y2 = ptrWeak->Feature.ptrPtOffsets[8] + pixOffset;
    __m256 dp; // Dot product.
    __m256 tmp0, tmp1, tmp2, tmp3;
    tmp0 = _mm256_sub_ps(_mm256_add_ps(ptr_x1y1->datam256, ptr_x0y0->datam256), _mm256_add_ps(ptr_x1y0->datam256, ptr_x0y1->datam256));
    tmp1 = _mm256_sub_ps(_mm256_add_ps(ptr_x2y1->datam256, ptr_x1y0->datam256), _mm256_add_ps(ptr_x2y0->datam256, ptr_x1y1->datam256));
    tmp2 = _mm256_sub_ps(_mm256_add_ps(ptr_x1y2->datam256, ptr_x0y1->datam256), _mm256_add_ps(ptr_x1y1->datam256, ptr_x0y2->datam256));
    tmp3 = _mm256_sub_ps(_mm256_add_ps(ptr_x2y2->datam256, ptr_x1y1->datam256), _mm256_add_ps(ptr_x2y1->datam256, ptr_x1y2->datam256));
    // Calculate the inner product, add eps, and take the reciprocal square root.
    // _mm256_dp_ps works within each 128-bit half, so swap the halves and add
    // to get the full sum into every element.
    dp = _mm256_add_ps(_mm256_add_ps(_mm256_dp_ps(tmp0, tmp0, 255), _mm256_dp_ps(tmp1, tmp1, 255)),
                       _mm256_add_ps(_mm256_dp_ps(tmp2, tmp2, 255), _mm256_dp_ps(tmp3, tmp3, 255)));
    dp = _mm256_add_ps(dp, _mm256_permute2f128_ps(dp, dp, 3 | 0<<4));
    __m256 m256_eps = _mm256_set1_ps(1e-8f);
    dp = _mm256_add_ps(dp, m256_eps);
    dp = _mm256_rsqrt_ps(dp);
    // Normalize and inner prod with the projections.
    __m256 res0 = _mm256_mul_ps(_mm256_load_ps(ptrWeak->Projection), tmp0);
    __m256 res1 = _mm256_mul_ps(_mm256_load_ps(ptrWeak->Projection+8), tmp1);
    __m256 res2 = _mm256_mul_ps(_mm256_load_ps(ptrWeak->Projection+16), tmp2);
    __m256 res3 = _mm256_mul_ps(_mm256_load_ps(ptrWeak->Projection+24), tmp3);
    res0 = _mm256_add_ps(_mm256_add_ps(res0, res1), _mm256_add_ps(res2, res3));
    res0 = _mm256_mul_ps(res0, dp);
    // Horizontal sum: the two hadds put the per-half sums in elements 3 and 7,
    // and the final cross-half add puts the total in element 7.
    __m256 m256_zero = _mm256_setzero_ps();
    res0 = _mm256_hadd_ps(m256_zero, res0);
    res0 = _mm256_hadd_ps(m256_zero, res0);
    res0 = _mm256_add_ps(res0, _mm256_permute2f128_ps(res0, res0, 3 | 0<<4));
    return res0.m256_f32[7];
}