It would be nice if you could provide code that we can compile. Please also tell us which compiler options you used.
Sure. The full thing can be found at http://gitorious.org/vc. The part of the testcase that is relevant here is:
#include "benchmark.h"
#include <cstdio>
#include <cstdlib>
// Number of inner-loop iterations timed per repetition; also scales the FLOP count handed to Benchmark.
static const int factor = 1000000;
/// Returns a pseudo-random float between min and max, derived from one call to rand().
///
/// The rand() result is multiplied by the interval width and then divided by RAND_MAX
/// before being offset by min. Each call advances the global rand() state once.
static float randomF(float min, float max)
{
    const float width = max - min;
    const float scaled = width * static_cast<float>(rand());
    return min + scaled / static_cast<float>(RAND_MAX);
}
/// Convenience wrapper: a pseudo-random float between 1 and 2, obtained from randomF.
static float randomF12()
{
    return randomF(1.f, 2.f);
}
int main()
{
int blackHole = true;
{
Benchmark timer("SAXPY (reference)", 8. * float_v::Size * factor, "FLOP");
for (int repetitions = 0; repetitions < 10; ++repetitions) {
#ifdef USE_SSE
__m128 tmp = _mm_set1_ps(static_cast<float>(repetitions));
const __m128 oPoint2 = _mm_set1_ps(randomF(.1f, .2f));
const __m128 oPoint1 = _mm_set1_ps(randomF(.1f, .2f));
const __m128 alpha[4] = {
_mm_add_ps(tmp, oPoint2),
_mm_sub_ps(tmp, oPoint2),
_mm_add_ps(tmp, oPoint1),
_mm_sub_ps(tmp, oPoint1)
};
__m128 x[4] = { _mm_set1_ps(randomF12()), _mm_set1_ps(randomF12()), _mm_set1_ps(randomF12()), _mm_set1_ps(randomF12()) };
const __m128 y[4] = { _mm_set1_ps(randomF12()), _mm_set1_ps(randomF12()), _mm_set1_ps(randomF12()), _mm_set1_ps(randomF12()) };
timer.Start();
///////////////////////////////////////
for (int i = 0; i < factor; ++i) {
x[0] = _mm_add_ps(_mm_mul_ps(alpha[0], x[0]), y[0]);
x[1] = _mm_add_ps(_mm_mul_ps(alpha[1], x[1]), y[1]);
x[2] = _mm_add_ps(_mm_mul_ps(alpha[2], x[2]), y[2]);
x[3] = _mm_add_ps(_mm_mul_ps(alpha[3], x[3]), y[3]);
}
///////////////////////////////////////
timer.Stop();
const int k = _mm_movemask_ps(_mm_add_ps(_mm_add_ps(x[0], x[1]), _mm_add_ps(x[2], x[3])));
blackHole &= k;
}
timer.Print(Benchmark::PrintAverage);
}
if (blackHole != 0) {
std::cout << std::endl;
}
return 0;
}
Find benchmark.h at http://gitorious.org/vc/vc/blobs/master/benchmarks/benchmark.h.
With gcc, I compile using -O3.
With icc, I compile using -mieee-fp -O3 -ansi-alias.