The code is compiled with MSVC 2010 SP1 using /arch:AVX, and the AVX version is slightly (5~10%) slower than the SSE version. I am using a Xeon E3-1230 V2 processor with 16 GB of dual-channel DDR3-1600 memory.

Both functions read 416 bytes of data from memory (9 floating-point vectors of length 8, plus another 4 float vectors of length 8) and return a float value; there is no memory store involved. The compiled SSE version has 111 instructions, and the AVX version has 67 instructions. All memory accesses are aligned (16-byte for SSE, 32-byte for AVX). The only difference between the two versions is that the SSE version processes 4 floats per instruction, so it needs two instructions for a length-8 vector, while the AVX version processes 8 floats per instruction.

The AVX version should be at least as fast as the SSE version even if the program is memory-bound, but it turns out the AVX version is slower. The code is the core of an image processing program; the SSE version processes an image in ~180 ms, while the AVX version takes ~200 ms. The function is called about 2M times while processing an image, with different inputs.

The code is as follows.

SSE version:

/*
 * Evaluate one 2C2R SURF weak classifier at a pixel offset (SSE4.1 path).
 *
 * ptrWeak   - weak classifier; supplies 9 precomputed integral-image point
 *             tables and the 32-float projection matrix.  ptrWeak->Projection
 *             must be 16-byte aligned (required by _mm_load_ps).
 * pixOffset - element offset into each point table for the current pixel.
 *
 * Returns the inner product of the L2-normalized 32-float feature with the
 * projection matrix.  Normalization uses _mm_rsqrt_ps, i.e. an approximate
 * (~12-bit) reciprocal square root -- this is intentional for speed.
 *
 * NOTE(review): assumes SURFPixData carries its 8 floats as two 16-byte
 * aligned __m128 members datam128[0..1] -- confirm with the struct definition.
 */
float _SURFEvalProjection_2C2R_Fast(SURFWeakClassifier * ptrWeak, int pixOffset)
{
    /* Corner points of the 3x3 integral-image grid for this pixel. */
    SURFPixData * ptr_x0y0 = ptrWeak->Feature.ptrPtOffsets[0] + pixOffset;
    SURFPixData * ptr_x0y1 = ptrWeak->Feature.ptrPtOffsets[1] + pixOffset;
    SURFPixData * ptr_x0y2 = ptrWeak->Feature.ptrPtOffsets[2] + pixOffset;
    SURFPixData * ptr_x1y0 = ptrWeak->Feature.ptrPtOffsets[3] + pixOffset;
    SURFPixData * ptr_x1y1 = ptrWeak->Feature.ptrPtOffsets[4] + pixOffset;
    SURFPixData * ptr_x1y2 = ptrWeak->Feature.ptrPtOffsets[5] + pixOffset;
    SURFPixData * ptr_x2y0 = ptrWeak->Feature.ptrPtOffsets[6] + pixOffset;
    SURFPixData * ptr_x2y1 = ptrWeak->Feature.ptrPtOffsets[7] + pixOffset;
    SURFPixData * ptr_x2y2 = ptrWeak->Feature.ptrPtOffsets[8] + pixOffset;

    __m128 dp_4, dp_8;
    __m128 tmp40, tmp41, tmp42, tmp43, tmp80, tmp81, tmp82, tmp83;

    /* Box sums of the four 2x2 cells via the integral-image identity
     * sum = BR + TL - TR - BL.  tmp4x holds floats 0..3 of each 8-float
     * vector, tmp8x holds floats 4..7. */
    tmp40 = _mm_sub_ps(_mm_add_ps(ptr_x1y1->datam128[0], ptr_x0y0->datam128[0]), _mm_add_ps(ptr_x1y0->datam128[0], ptr_x0y1->datam128[0]));
    tmp41 = _mm_sub_ps(_mm_add_ps(ptr_x2y1->datam128[0], ptr_x1y0->datam128[0]), _mm_add_ps(ptr_x2y0->datam128[0], ptr_x1y1->datam128[0]));
    tmp42 = _mm_sub_ps(_mm_add_ps(ptr_x1y2->datam128[0], ptr_x0y1->datam128[0]), _mm_add_ps(ptr_x1y1->datam128[0], ptr_x0y2->datam128[0]));
    tmp43 = _mm_sub_ps(_mm_add_ps(ptr_x2y2->datam128[0], ptr_x1y1->datam128[0]), _mm_add_ps(ptr_x2y1->datam128[0], ptr_x1y2->datam128[0]));
    tmp80 = _mm_sub_ps(_mm_add_ps(ptr_x1y1->datam128[1], ptr_x0y0->datam128[1]), _mm_add_ps(ptr_x1y0->datam128[1], ptr_x0y1->datam128[1]));
    tmp81 = _mm_sub_ps(_mm_add_ps(ptr_x2y1->datam128[1], ptr_x1y0->datam128[1]), _mm_add_ps(ptr_x2y0->datam128[1], ptr_x1y1->datam128[1]));
    tmp82 = _mm_sub_ps(_mm_add_ps(ptr_x1y2->datam128[1], ptr_x0y1->datam128[1]), _mm_add_ps(ptr_x1y1->datam128[1], ptr_x0y2->datam128[1]));
    tmp83 = _mm_sub_ps(_mm_add_ps(ptr_x2y2->datam128[1], ptr_x1y1->datam128[1]), _mm_add_ps(ptr_x2y1->datam128[1], ptr_x1y2->datam128[1]));

    /* Squared L2 norm of all 32 components; mask 0xFF makes _mm_dp_ps
     * broadcast each 4-wide dot product to every lane. */
    dp_4 = _mm_add_ps(_mm_add_ps(_mm_dp_ps(tmp40, tmp40, 0xFF), _mm_dp_ps(tmp41, tmp41, 0xFF)),
                      _mm_add_ps(_mm_dp_ps(tmp42, tmp42, 0xFF), _mm_dp_ps(tmp43, tmp43, 0xFF)));
    dp_8 = _mm_add_ps(_mm_add_ps(_mm_dp_ps(tmp80, tmp80, 0xFF), _mm_dp_ps(tmp81, tmp81, 0xFF)),
                      _mm_add_ps(_mm_dp_ps(tmp82, tmp82, 0xFF), _mm_dp_ps(tmp83, tmp83, 0xFF)));
    dp_4 = _mm_add_ps(dp_4, dp_8);

    /* eps keeps rsqrt finite on an all-zero feature.  Use a proper float
     * literal (1e-8f, avoids a silent double->float truncation) and
     * _mm_set1_ps rather than replicating the value through _mm_set_ps. */
    const __m128 m128_eps = _mm_set1_ps(1e-8f);
    dp_4 = _mm_add_ps(dp_4, m128_eps);
    dp_4 = _mm_rsqrt_ps(dp_4); /* approx 1/sqrt(norm^2 + eps) in every lane */

    /* Multiply the (un-normalized) feature with the projection matrix.
     * Layout: feature k's floats 0..3 pair with Projection + 8*k, and its
     * floats 4..7 with Projection + 8*k + 4. */
    __m128 res0 = _mm_mul_ps(_mm_load_ps(ptrWeak->Projection), tmp40);
    __m128 res1 = _mm_mul_ps(_mm_load_ps(ptrWeak->Projection+8), tmp41);
    __m128 res2 = _mm_mul_ps(_mm_load_ps(ptrWeak->Projection+16), tmp42);
    __m128 res3 = _mm_mul_ps(_mm_load_ps(ptrWeak->Projection+24), tmp43);
    __m128 res4 = _mm_mul_ps(_mm_load_ps(ptrWeak->Projection+4), tmp80);
    __m128 res5 = _mm_mul_ps(_mm_load_ps(ptrWeak->Projection+12), tmp81);
    __m128 res6 = _mm_mul_ps(_mm_load_ps(ptrWeak->Projection+20), tmp82);
    __m128 res7 = _mm_mul_ps(_mm_load_ps(ptrWeak->Projection+28), tmp83);

    res0 = _mm_add_ps(_mm_add_ps(res0, res1), _mm_add_ps(res2, res3));
    res1 = _mm_add_ps(_mm_add_ps(res4, res5), _mm_add_ps(res6, res7));
    res0 = _mm_add_ps(res0, res1);
    /* Scale by the (broadcast) normalization factor. */
    res0 = _mm_mul_ps(res0, dp_4);

    /* Horizontal sum: two hadds leave the total in every lane, so the scalar
     * can be read with the portable _mm_cvtss_f32 instead of the MSVC-only
     * .m128_f32 union member.  The addition order is identical to the
     * original (0+1)+(2+3) pairing, so the result is bit-identical. */
    res0 = _mm_hadd_ps(res0, res0);
    res0 = _mm_hadd_ps(res0, res0);
    return _mm_cvtss_f32(res0);
}

AVX version:

/*
 * AVX variant of _SURFEvalProjection_2C2R_Fast: same math, but each 8-float
 * vector is handled by one 256-bit operation instead of two 128-bit ones.
 *
 * ptrWeak   - weak classifier; Projection must be 32-byte aligned here
 *             (required by _mm256_load_ps).
 * pixOffset - element offset into each point table for the current pixel.
 *
 * Returns the projection response (see the SSE version for the contract).
 *
 * NOTE(review): assumes SURFPixData exposes the 8 floats as one 32-byte
 * aligned __m256 member datam256 -- confirm with the struct definition.
 * NOTE(review): on Ivy Bridge, 256-bit loads are executed as two 128-bit
 * operations and _mm256_dp_ps decodes to multiple uops, which may explain
 * why this path does not beat the SSE version -- worth profiling rather
 * than assuming the wider vectors help.
 */
float _SURFEvalProjection_2C2R_Fast_AVX(SURFWeakClassifier * ptrWeak, int pixOffset)
{
    /* Corner points of the 3x3 integral-image grid for this pixel. */
    SURFPixData * ptr_x0y0 = ptrWeak->Feature.ptrPtOffsets[0] + pixOffset;
    SURFPixData * ptr_x0y1 = ptrWeak->Feature.ptrPtOffsets[1] + pixOffset;
    SURFPixData * ptr_x0y2 = ptrWeak->Feature.ptrPtOffsets[2] + pixOffset;
    SURFPixData * ptr_x1y0 = ptrWeak->Feature.ptrPtOffsets[3] + pixOffset;
    SURFPixData * ptr_x1y1 = ptrWeak->Feature.ptrPtOffsets[4] + pixOffset;
    SURFPixData * ptr_x1y2 = ptrWeak->Feature.ptrPtOffsets[5] + pixOffset;
    SURFPixData * ptr_x2y0 = ptrWeak->Feature.ptrPtOffsets[6] + pixOffset;
    SURFPixData * ptr_x2y1 = ptrWeak->Feature.ptrPtOffsets[7] + pixOffset;
    SURFPixData * ptr_x2y2 = ptrWeak->Feature.ptrPtOffsets[8] + pixOffset;

    __m256 dp; /* squared L2 norm, eventually broadcast to all elements */
    __m256 tmp0, tmp1, tmp2, tmp3;

    /* Box sums of the four 2x2 cells (integral-image identity
     * sum = BR + TL - TR - BL), one full 8-float vector per register. */
    tmp0 = _mm256_sub_ps(_mm256_add_ps(ptr_x1y1->datam256, ptr_x0y0->datam256), _mm256_add_ps(ptr_x1y0->datam256, ptr_x0y1->datam256));
    tmp1 = _mm256_sub_ps(_mm256_add_ps(ptr_x2y1->datam256, ptr_x1y0->datam256), _mm256_add_ps(ptr_x2y0->datam256, ptr_x1y1->datam256));
    tmp2 = _mm256_sub_ps(_mm256_add_ps(ptr_x1y2->datam256, ptr_x0y1->datam256), _mm256_add_ps(ptr_x1y1->datam256, ptr_x0y2->datam256));
    tmp3 = _mm256_sub_ps(_mm256_add_ps(ptr_x2y2->datam256, ptr_x1y1->datam256), _mm256_add_ps(ptr_x2y1->datam256, ptr_x1y2->datam256));

    /* _mm256_dp_ps works independently per 128-bit lane; each lane ends up
     * with its own 16-component partial sum of squares... */
    dp = _mm256_add_ps(_mm256_add_ps(_mm256_dp_ps(tmp0, tmp0, 0xFF), _mm256_dp_ps(tmp1, tmp1, 0xFF)),
                       _mm256_add_ps(_mm256_dp_ps(tmp2, tmp2, 0xFF), _mm256_dp_ps(tmp3, tmp3, 0xFF)));
    /* ...so add the lane-swapped copy (imm 0x01 swaps the 128-bit halves)
     * to give every element the full 32-component squared norm. */
    dp = _mm256_add_ps(dp, _mm256_permute2f128_ps(dp, dp, 0x01));

    /* eps keeps rsqrt finite on an all-zero feature; float literal avoids a
     * silent double->float truncation, set1 avoids the 8-argument set. */
    const __m256 m256_eps = _mm256_set1_ps(1e-8f);
    dp = _mm256_add_ps(dp, m256_eps);
    dp = _mm256_rsqrt_ps(dp); /* approximate 1/sqrt(norm^2 + eps) */

    /* Multiply the (un-normalized) feature with the projection rows. */
    __m256 res0 = _mm256_mul_ps(_mm256_load_ps(ptrWeak->Projection), tmp0);
    __m256 res1 = _mm256_mul_ps(_mm256_load_ps(ptrWeak->Projection+8), tmp1);
    __m256 res2 = _mm256_mul_ps(_mm256_load_ps(ptrWeak->Projection+16), tmp2);
    __m256 res3 = _mm256_mul_ps(_mm256_load_ps(ptrWeak->Projection+24), tmp3);
    res0 = _mm256_add_ps(_mm256_add_ps(res0, res1), _mm256_add_ps(res2, res3));
    res0 = _mm256_mul_ps(res0, dp);

    /* Horizontal sum: two in-lane hadds put each lane's total in all of its
     * elements; adding the lane-swapped copy then yields the grand total in
     * element 0, extracted with the portable cast + _mm_cvtss_f32 instead of
     * the MSVC-only .m256_f32 union member.  The pairing (0+1)+(2+3) matches
     * the original, so the result is numerically identical. */
    res0 = _mm256_hadd_ps(res0, res0);
    res0 = _mm256_hadd_ps(res0, res0);
    res0 = _mm256_add_ps(res0, _mm256_permute2f128_ps(res0, res0, 0x01));
    return _mm_cvtss_f32(_mm256_castps256_ps128(res0));
}