I have a matrix solver (BiCCG) which is being used to solve a set of algebraic equations arising from a three-dimensional computational domain. I have tried to parallelise it using OpenMP but am struggling with performance issues. On inspecting the code using Intel Advisor, it is evident that almost 80% of the solution time goes into the solver, out of which there is one function which accounts for 50% of the solution time. Digging even deeper, it is found that 5 loops out of 6 are performing terribly, with no automatic vectorization, since they suffer from data dependencies. What I do see is that there is a dependency (for example in loop 3) because the i-th iteration uses the (i-1)-th iteration's values.

How can I change the design of the parallelisation so that it is as efficient as possible with this algorithm (rather than changing the algorithm altogether)? Would specifying #pragma omp simd safelen(1) help?

The loops are as follows (all loops except loop 5 fail to vectorise automatically and are the main bottlenecks of the function):

# pragma omp parallel num_threads(NTt) default(none) private(i,j,k, mythread, dummy) shared(STA,res_sparse_s,COEFF,p_sparse_s, ap_sparse_s,h_sparse_s,RLL, pipi_sparse, normres_sparse, riri_sparse,riri_sparse2,noemer_sparse, nx, ny, nz, nv, PeriodicBoundaryX, PeriodicBoundaryY, PeriodicBoundaryZ) { mythread = omp_get_thread_num();//0 // loop 1 #pragma omp for reduction(+:pipi_sparse) for (i=1; i<=nx; i++) for (j=1; j<=ny; j++) for (k=1; k<=nz; k++) { dummy = COEFF[i][j][k][6] * p_sparse_s[i][j][k]; if (PeriodicBoundaryX && i == 1) dummy += COEFF[i][j][k][0] * p_sparse_s[nx ][j][k]; else dummy += COEFF[i][j][k][0] * p_sparse_s[i-1][j][k]; if (PeriodicBoundaryX && i == nx) dummy += COEFF[i][j][k][1] * p_sparse_s[1 ][j][k]; else dummy += COEFF[i][j][k][1] * p_sparse_s[i+1][j][k]; if (PeriodicBoundaryY && j == 1) dummy += COEFF[i][j][k][2] * p_sparse_s[i][ny ][k]; else dummy += COEFF[i][j][k][2] * p_sparse_s[i][j-1][k]; if (PeriodicBoundaryY && j == ny) dummy += COEFF[i][j][k][3] * p_sparse_s[i][ 1][k]; else dummy += COEFF[i][j][k][3] * p_sparse_s[i][j+1][k]; if (PeriodicBoundaryZ && k == 1) dummy += COEFF[i][j][k][4] * p_sparse_s[i][j][nz ]; else dummy += COEFF[i][j][k][4] * p_sparse_s[i][j][k-1]; if (PeriodicBoundaryZ && k == nz) dummy += COEFF[i][j][k][5] * p_sparse_s[i][j][ 1]; else dummy += COEFF[i][j][k][5] * p_sparse_s[i][j][k+1]; ap_sparse_s[i][j][k] = dummy; pipi_sparse += p_sparse_s[i][j][k] * ap_sparse_s[i][j][k]; } // loop 2 #pragma omp for reduction(+:normres_sparse) for (i=1; i<=nx; i++) for (j=1; j<=ny; j++) for (k=1; k<=nz; k++) { STA[i][j][k] += (riri_sparse / pipi_sparse) * p_sparse_s[i][j][k]; res_sparse_s[i][j][k] -= (riri_sparse / pipi_sparse) * ap_sparse_s[i][j][k]; normres_sparse += (res_sparse_s[i][j][k] * res_sparse_s[i][j][k])/ nv;// need to take square root separately } // loop 3 // FORWARD #pragma omp for schedule(static, nx/NTt) for (i=1; i<=nx; i++) for (j=1; j<=ny; j++) for (k=1; k<=nz; k++) { dummy = res_sparse_s[i][j][k]; dummy -= 
COEFF[i][j][k][7] * RLL[i-1][j][k]; if (PeriodicBoundaryX && i==nx)dummy -= COEFF[i][j][k][8] * RLL[1 ][j][k]; dummy -= COEFF[i][j][k][2] * RLL[i][j-1][k]; if (PeriodicBoundaryY && j==ny) dummy -= COEFF[i][j][k][3] * RLL[i][1 ][k]; dummy -= COEFF[i][j][k][4] * RLL[i][j][k-1]; if (PeriodicBoundaryZ && k==nz) dummy -= COEFF[i][j][k][5] * RLL[i][j][1 ]; RLL[i][j][k] = dummy / h_sparse_s[i][j][k]; } // loop 4 // BACKWARD #pragma omp for schedule(static, nx/NTt) for (i=nx; i>=1;i--) for (j=ny; j>=1;j--) for (k=nz; k>=1;k--) { dummy = RLL[i][j][k]*h_sparse_s[i][j][k]; if (PeriodicBoundaryX && i==1) dummy -= COEFF[i][j][k][7] * RLL[nx ][j][k]; dummy -= COEFF[i][j][k][8] * RLL[i+1][j][k]; if (PeriodicBoundaryY && j==1) dummy -= COEFF[i][j][k][2] * RLL[i][ny ][k]; dummy -= COEFF[i][j][k][3] * RLL[i][j+1][k]; if (PeriodicBoundaryZ && k==1) dummy -= COEFF[i][j][k][4] * RLL[i][j][nz ]; dummy -= COEFF[i][j][k][5] * RLL[i][j][k+1]; RLL[i][j][k] = dummy / h_sparse_s[i][j][k]; } // loop 5 #pragma omp for reduction(+:riri_sparse2) for (i=1; i<=nx; i++) for (j=1; j<=ny; j++) for (k=1; k<=nz; k++) { riri_sparse2 += res_sparse_s[i][j][k] * RLL[i][j][k]; } // loop 6 #pragma omp for for (i=1; i<=nx; i++) for (j=1; j<=ny; j++) for (k=1; k<=nz; k++) { p_sparse_s[i][j][k] = RLL[i][j][k] + (riri_sparse2 / noemer_sparse) * p_sparse_s[i][j][k]; } } noemer_sparse = riri_sparse2; riri_sparse = riri_sparse2; return; }