missed optimization on SSE multiply-add loop

Matthias Kretz
Total Points:
310
Status Points:
260
Green Belt
July 9, 2009 9:51 AM PDT
Rate
 
#2 Reply to #1

It would be nice if you could provide code that we can compile. Please also tell us which compiler options you are using.

Sure. The full thing can be found at http://gitorious.org/vc. The part of the testcase that is relevant here is:
#include "benchmark.h"
#include <cstdio>     
#include <cstdlib>    

// Number of iterations of the timed inner loop per repetition; also used to
// scale the operation count handed to the Benchmark object.
static const int factor = 1000 * 1000;

/**
 * Returns a pseudo-random float derived from rand(), mapped linearly onto
 * the interval [min, max].
 *
 * The sequence depends on the current rand() state, so seeding with srand()
 * makes the result reproducible.
 */
static float randomF(float min, float max)
{
    const float span = max - min;
    // Multiply first, then divide, all in single precision.
    const float scaled = span * rand();
    return min + scaled / RAND_MAX;
}

// Convenience wrapper: a pseudo-random float between 1 and 2, obtained from
// randomF.
static float randomF12()
{
    return randomF(1.0f, 2.0f);
}

int main()
{         
    int blackHole = true;
    {           
        Benchmark timer("SAXPY (reference)", 8. * float_v::Size * factor, "FLOP");
        for (int repetitions = 0; repetitions < 10; ++repetitions) {              
#ifdef USE_SSE                                                                    
            __m128 tmp = _mm_set1_ps(static_cast<float>(repetitions));            
            const __m128 oPoint2 = _mm_set1_ps(randomF(.1f, .2f));                
            const __m128 oPoint1 = _mm_set1_ps(randomF(.1f, .2f));                
            const __m128 alpha[4] = {                                             
                _mm_add_ps(tmp, oPoint2),                                         
                _mm_sub_ps(tmp, oPoint2),                                         
                _mm_add_ps(tmp, oPoint1),                                         
                _mm_sub_ps(tmp, oPoint1)                                          
            };                                                                    
            __m128 x[4] = { _mm_set1_ps(randomF12()), _mm_set1_ps(randomF12()), _mm_set1_ps(randomF12()), _mm_set1_ps(randomF12()) };
            const __m128 y[4] = { _mm_set1_ps(randomF12()), _mm_set1_ps(randomF12()), _mm_set1_ps(randomF12()), _mm_set1_ps(randomF12()) };

            timer.Start();
            ///////////////////////////////////////

            for (int i = 0; i < factor; ++i) {
                    x[0] = _mm_add_ps(_mm_mul_ps(alpha[0], x[0]), y[0]);
                    x[1] = _mm_add_ps(_mm_mul_ps(alpha[1], x[1]), y[1]);
                    x[2] = _mm_add_ps(_mm_mul_ps(alpha[2], x[2]), y[2]);
                    x[3] = _mm_add_ps(_mm_mul_ps(alpha[3], x[3]), y[3]);
            }                                                           

            ///////////////////////////////////////
            timer.Stop();                          

            const int k = _mm_movemask_ps(_mm_add_ps(_mm_add_ps(x[0], x[1]), _mm_add_ps(x[2], x[3])));
            blackHole &= k;
        }
        timer.Print(Benchmark::PrintAverage);
    }
    if (blackHole != 0) {
        std::cout << std::endl;
    }
    return 0;
}
Find benchmark.h at http://gitorious.org/vc/vc/blobs/master/benchmarks/benchmark.h.

With gcc I compile using -O3.
With icc I compile using -mieee-fp -O3 -ansi-alias.


Intel Software Network Forums Statistics

8470 users have contributed to 31601 threads and 100650 posts to date.
In the past 24 hours, we have 29 new thread(s), 115 new post(s), and 162 new user(s).

In the past 3 days, the most popular thread for everyone has been gemm(A,A,A) like possible? The most posts were made to gemm(A,A,A) like possible? The post with the most views is Dear Steve, excuse me for a d

Please welcome our newest member kopernikus