OpenMP breaks auto-vectorization

hpcmango
Total Points:
135
Status Points:
85
Green Belt
June 26, 2009 8:34 AM PDT
Rate
 
#2 Reply to #1
Thanks for the answer.

Unfortunately upgrading to 11.1 will take a week or so, because I'm not the admin on the machine with icc. I will try it of course, when the upgrade is done.

I did try -inline-max-size=50 with version 11.0, but it didn't help. In any case, I wonder how it could have an effect, since it appears to control function inlining and my code consists only of main().

While playing around, I found another way to get it to vectorize. I am using a C++ timer class (which simply wraps the POSIX high-resolution timers for convenience) to measure time. If I remove the usage of this class (and inline the timer code into main instead), vectorization also works with OpenMP.

Really strange. The timer code is completely outside the region of interest. Is it possible that OpenMP or the vectorizer doesn't like object-oriented programming and shuts down completely when it is used?

--- Main.cxx ---
#include <stdio.h>
#include <stdlib.h>

#include <omp.h>

#include "Timer.hxx"

/*
 * Benchmark driver: repeatedly applies a 5-point weighted-average stencil
 * to a sizeX x sizeY grid of floats, ping-ponging between two buffers
 * (dataA -> dataB, then dataB -> dataA) for `loops` iterations.
 *
 * Prints the OpenMP thread ids to stdout, then the elapsed time, the
 * achieved FLOPS and an 11x11 window around the grid centre to stderr.
 *
 * Returns 0 on success, EXIT_FAILURE if the grids cannot be allocated.
 */
int main()
{
#pragma omp parallel
    {
        printf("OpenMP thread = %i/%i.\n",omp_get_thread_num(),omp_get_num_threads());
    }

    const int sizeX = 8192;
    const int sizeY = 8192;
    const int loops = 100;

    // Element count as size_t so the byte size is never computed in int.
    const size_t cellCount = (size_t)sizeX * (size_t)sizeY;

    float* __restrict dataA = (float*)malloc(cellCount * sizeof(float));
    float* __restrict dataB = (float*)malloc(cellCount * sizeof(float));
    if (dataA == NULL || dataB == NULL) {
        fprintf(stderr,"Failed to allocate two %dx%d float grids.\n",sizeX,sizeY);
        free(dataA);   // free(NULL) is a no-op, so no guard is needed
        free(dataB);
        return EXIT_FAILURE;
    }

    // Zero BOTH grids. The sweeps below only ever write interior cells,
    // yet they read the boundary cells of their source grid; dataB's
    // boundary would otherwise be read uninitialized in the second sweep.
    for (int i = 0; i < sizeY; i++) {
        for (int j = 0; j < sizeX; j++) {
            dataA[i*sizeX+j] = 0;
            dataB[i*sizeX+j] = 0;
        }
    }
    // Single unit impulse in the centre of the grid.
    dataA[(sizeY/2)*sizeX+(sizeX/2)] = 1;

    Timer timer;
    for (int iLoop = 0; iLoop < loops; iLoop++) {

        // Sweep 1: dataA -> dataB over the interior cells.
        // 0.1 and 0.6 are double literals, so the expression is evaluated
        // in double and converted back to float on the store.
#pragma omp parallel for
        for (int i = 1; i < sizeY-1; i++) {
            int curIndex = 1+i*sizeX;   // first interior cell of row i
            for (int j = 1; j < sizeX-1; j++) {
                dataB[curIndex]=0.1*(dataA[curIndex-1]+dataA[curIndex+1]+dataA[curIndex-sizeX]+dataA[curIndex+sizeX])+0.6*dataA[curIndex];
                curIndex++;
            }
        }

        // Sweep 2: dataB -> dataA over the interior cells.
#pragma omp parallel for
        for (int i = 1; i < sizeY-1; i++) {
            int curIndex = 1+i*sizeX;   // first interior cell of row i
            for (int j = 1; j < sizeX-1; j++) {
                dataA[curIndex]=0.1*(dataB[curIndex-1]+dataB[curIndex+1]+dataB[curIndex-sizeX]+dataB[curIndex+sizeX])+0.6*dataB[curIndex];
                curIndex++;
            }
        }
    }
    double duration = timer.get();

    // 6 floating-point operations per updated cell (4 additions,
    // 2 multiplications), (sizeX-2)*(sizeY-2) interior cells per sweep,
    // 2 sweeps per iteration.
    const double updatedCells = (double)(sizeX-2)*(sizeY-2);
    fprintf(stderr,"Time = %g s, Performance = %g FLOPS\n",duration,6.*updatedCells*2*loops/duration);

    // Dump the 11x11 neighbourhood of the centre cell.
    fprintf(stderr,"\n");
    for (int i = sizeY/2-5; i <= sizeY/2+5; i++) {
        for (int j = sizeX/2-5; j <= sizeX/2+5; j++) {
            fprintf(stderr,"%f ",dataA[i*sizeX+j]);
        }
        fprintf(stderr,"\n");
    }

    free(dataA);
    free(dataB);

    return 0;
}

--- Timer.hxx ---
#ifndef om_timer_hxx_
#define om_timer_hxx_

#include <time.h>

// Stopwatch built on clock_gettime(CLOCK_MONOTONIC).
// The start point is captured on construction and on reset(); get() and
// getAndReset() report the seconds elapsed since that start point.
class Timer {
public:
    // Starts timing immediately.
    Timer() {
        clock_gettime(CLOCK_MONOTONIC,&m_Timespec);
    }

    // Moves the start point to the current time.
    void reset() {
        clock_gettime(CLOCK_MONOTONIC,&m_Timespec);
    }

    // Returns the seconds elapsed since the start point; the start point
    // is left unchanged.
    double get() {
        struct timespec now;
        clock_gettime(CLOCK_MONOTONIC,&now);
        return secondsBetween(m_Timespec,now);
    }

    // Returns the seconds elapsed since the start point and makes the
    // same clock reading the new start point.
    double getAndReset() {
        struct timespec now;
        clock_gettime(CLOCK_MONOTONIC,&now);
        const double elapsed = secondsBetween(m_Timespec,now);
        m_Timespec = now;
        return elapsed;
    }

private:
    // Difference `to - from` in seconds: whole seconds plus the
    // nanosecond difference scaled by 1e-9.
    static double secondsBetween(const struct timespec& from,
                                 const struct timespec& to) {
        return (to.tv_sec-from.tv_sec)+
               (to.tv_nsec-from.tv_nsec)*1e-9;
    }

    struct timespec m_Timespec;   // start point of the current measurement
};

#endif



Intel Software Network Forums Statistics

8472 users have contributed to 31603 threads and 100653 posts to date.
In the past 24 hours, we have 31 new thread(s) 112 new posts(s), and 166 new user(s).

In the past 3 days, the most popular thread for everyone has been gemm(A,A,A) like possible? The most posts were made to gemm(A,A,A) like possible? The post with the most views is Dear Steve, excuse me for a d

Please welcome our newest member Edwin B. Ramayya