Dear All,

I have the following code:

/*
 * Jacobi sweeps on the row-major N x N matrix A: run MAX_ITER times, each
 * time building new_x from the current X and then copying it back into X.
 */
for (k = 0; k < MAX_ITER; k++) {
    /* Pass 1: every row i is independent, so rows are split across threads. */
    #pragma omp parallel for private(i, j, sum)
    for (i = 0; i < N; i++) {
        /*
         * Start from minus the diagonal term so that the full-row sum
         * below leaves only the off-diagonal contributions.
         */
        sum = 0.0 - A[i * N + i] * X[i];
        for (j = 0; j < N; j++) {
            sum = sum + A[i * N + j] * X[j];
        }
        new_x[i] = (B[i] - sum) / A[i * N + i];
    }

    /* Pass 2: publish the new iterate for the next sweep. */
    #pragma omp parallel for private(i)
    for (i = 0; i < N; i++) {
        X[i] = new_x[i];
    }
}

I tried to optimize it by removing the overhead of the repeated pragmas and their implicit barriers, which gave the following code:

/*
 * Single long-lived parallel region: each thread takes the kk consecutive
 * rows starting at ii = tid * kk and performs all MAX_ITER sweeps itself,
 * synchronising with the other threads through explicit barriers.
 * NOTE(review): assumes kk * (number of threads) == N so that every row is
 * covered exactly once and no index runs past the arrays -- TODO confirm.
 */
#pragma omp parallel shared (A,B,X,N,T,kk) private(k,i,ii,j,sum,Xnew_sub)
{
    /*
     * Per-thread scratch buffer holding this thread's slice of the new
     * iterate.
     * NOTE(review): the result of malloc is not checked, and the matching
     * free() is not visible in this excerpt.
     */
    Xnew_sub=(double*)malloc((kk)*sizeof(double));

    int tid=omp_get_thread_num();

    /* First row owned by this thread. */
    ii=tid*kk;

    for(k=0;k<MAX_ITER;k++)
    {
        for(i=0; i<kk; i++){
            /*
             * Start from minus the diagonal term so the full-row sum below
             * leaves only the off-diagonal contributions.
             */
            sum=0.0;
            sum = sum-(A[(i+ii)*N+i+ii]*X[i+ii]);

            for(j=0; j<N; j++){
                sum += A[(i+ii)*N+j] * X[j];
            }

            Xnew_sub[i]= (B[i+ii] - sum)/A[(i+ii)*N+i+ii];
        }

        /* Every thread must be done reading the old X before it is overwritten. */
        #pragma omp barrier

        for(i=0;i<kk;i++){
            X[i+ii]=Xnew_sub[i];
        }

        /*
         * Every thread must have published its slice of X before any thread
         * starts the next sweep; without this barrier a fast thread would
         * read X[j] entries that a slower thread has not yet updated.
         */
        #pragma omp barrier

I run it on the MIC; the problem is that there is no improvement. On the contrary, the second version has a longer execution time. I use the Intel Compiler version 13 update 1 with the default optimization options. How can I find out which optimizations the compiler applied implicitly to the code?

}//end of the iteration MAX_ITER