OpenMP/MPI code for matrix multiplication

Hi,
I have this matrix multiplication code that works fine with MPI, and I am trying to add OpenMP directives to make use of my quad-core resources. Can someone please tell me where and what to add so it becomes a hybrid OpenMP/MPI program? I have put a rough sketch of what I was thinking of trying right after the code listing. Any help would be appreciated!

#include <stdio.h>

#include <stdlib.h>

#include <math.h>

#include <mpi.h>

typedef struct GridInfo {

int p; /* Total number of processes */

MPI_Comm comm; /* Communicator for entire grid */

MPI_Comm row_comm; /* Communicator for my row */

MPI_Comm col_comm; /* Communicator for my col */

int q; /* Order of grid */

int my_row; /* My row number */

int my_col; /* My column number */

int my_rank; /* My rank in the grid comm */

} GRID_INFO_T;

#define MAX 65536

#define RandVal rand()%10 // Random value (0-9) used to fill the matrices

typedef struct LocalMatrix {

int n_bar;

#define Order(A) ((A)->n_bar)

float entries[MAX];

#define Entry(A,i,j) (*(((A)->entries) + ((A)->n_bar)*(i) + (j))) // Address arithmetic for row-major storage

} LOCAL_MATRIX_T;

/* Function Declarations */

LOCAL_MATRIX_T* Local_matrix_allocate(int n_bar); // Memory for matrix

void Free_local_matrix(LOCAL_MATRIX_T** local_A); // Free memory

void Generate_matrix(char* title, LOCAL_MATRIX_T* local_A, // Fills the matrix

GRID_INFO_T* grid, int n);

void Print_matrix(char* title, LOCAL_MATRIX_T* local_A, // Prints the matrix on stdout

GRID_INFO_T* grid, int n);

void Set_to_zero(LOCAL_MATRIX_T* local_A); // Fills the matrix with zeroes

void Local_matrix_multiply(LOCAL_MATRIX_T* local_A, // Local matrix multiply

LOCAL_MATRIX_T* local_B, LOCAL_MATRIX_T* local_C);

void Build_matrix_type(LOCAL_MATRIX_T* local_A); // Creation of new MPI type for matrix

MPI_Datatype local_matrix_mpi_t; // New MPI type for matrix

double Sequential_time(int n);

LOCAL_MATRIX_T* temp_mat;

/*********************************************************/

int main(int argc, char* argv[]) {

int p; // Number of processes

int my_rank; // Rank of process

GRID_INFO_T grid; // Grid info of process

LOCAL_MATRIX_T* local_A;

LOCAL_MATRIX_T* local_B;

LOCAL_MATRIX_T* local_C;

int n; // Order of matrix

int n_bar; // Order of submatrix

double begin_time, end_time, interval_p, interval_s;

void Setup_grid(GRID_INFO_T* grid); // Setup the grid info of process

void Fox(int n, GRID_INFO_T* grid, LOCAL_MATRIX_T* local_A, // Fox's algorithm

LOCAL_MATRIX_T* local_B, LOCAL_MATRIX_T* local_C);

MPI_Init(&argc, &argv);

MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);

MPI_Comm_size(MPI_COMM_WORLD, &p);

Setup_grid(&grid); // Setup the grid info of process

if (my_rank == 0) {

printf("Enter the matrices order: ");

fflush(stdout);

scanf("%d", &n);

}

//Sending size to all processes

MPI_Bcast(&n, 1, MPI_INT, 0, MPI_COMM_WORLD);

n_bar = n/grid.q; // Size of submatrix

local_A = Local_matrix_allocate(n_bar); // Memory for submatrix A in each process

Order(local_A) = n_bar;

Generate_matrix("\\nGenerate A", local_A, &grid, n); // Only P(0) do this

Print_matrix("Matrix A =", local_A, &grid, n); // Only P(0) do this

local_B = Local_matrix_allocate(n_bar); // Memory for submatrix B in each process

Order(local_B) = n_bar;

Generate_matrix("\\nGenerate B", local_B, &grid, n); // Only P(0) do this

Print_matrix("Matrix B =", local_B, &grid, n); // Only P(0) do this

Build_matrix_type(local_A); // Struct type for submatrix (MPI)

temp_mat = Local_matrix_allocate(n_bar); // Memory for Temp matrix

local_C = Local_matrix_allocate(n_bar); // Memory for result matrix C

Order(local_C) = n_bar; // Size of C

begin_time = MPI_Wtime();

Fox(n, &grid, local_A, local_B, local_C); // Fox's block algorithm

end_time = MPI_Wtime();

Print_matrix("\\nThe product is:", local_C, &grid, n);

interval_p = end_time - begin_time;

if (my_rank == 0) {

/* Sequential matrix multiplication */

interval_s = Sequential_time(n);

/* Speed-up */

printf("\\nTp = %.10f", interval_p);

printf("\\nTs = %.10f", interval_s);

printf("\\nS = %.10f\\n\\n", interval_s/interval_p);

}

Free_local_matrix(&local_A);

Free_local_matrix(&local_B);

Free_local_matrix(&local_C);

return MPI_Finalize();

} /* main */

/*********************************************************/

void Setup_grid(

GRID_INFO_T* grid /* out */) {

int old_rank;

int dimensions[2];

int wrap_around[2];

int coordinates[2];

int free_coords[2];

/* Set up Global Grid Information */

MPI_Comm_size(MPI_COMM_WORLD, &(grid->p));

MPI_Comm_rank(MPI_COMM_WORLD, &old_rank);

/* We assume p is a perfect square */

grid->q = (int) sqrt((double) grid->p);

dimensions[0] = dimensions[1] = grid->q;

/* We want a circular shift in second dimension. */

/* Don't care about first */

wrap_around[0] = wrap_around[1] = 1;

MPI_Cart_create(MPI_COMM_WORLD, 2, dimensions, wrap_around, 1, &(grid->comm));

MPI_Comm_rank(grid->comm, &(grid->my_rank));

MPI_Cart_coords(grid->comm, grid->my_rank, 2, coordinates);

grid->my_row = coordinates[0];

grid->my_col = coordinates[1];

/* Set up row communicators */

free_coords[0] = 0;

free_coords[1] = 1;

MPI_Cart_sub(grid->comm, free_coords, &(grid->row_comm));

/* Set up column communicators */

free_coords[0] = 1;

free_coords[1] = 0;

MPI_Cart_sub(grid->comm, free_coords, &(grid->col_comm));

} /* Setup_grid */

/*********************************************************/

void Fox(

int n /* in */,

GRID_INFO_T* grid /* in */,

LOCAL_MATRIX_T* local_A /* in */,

LOCAL_MATRIX_T* local_B /* in */,

LOCAL_MATRIX_T* local_C /* out */) {

LOCAL_MATRIX_T* temp_A; /* Storage for the sub- */

/* matrix of A used during */

/* the current stage */

int stage;

int bcast_root;

int n_bar; /* n/sqrt(p) */

int source;

int dest;

MPI_Status status;

n_bar = n/grid->q;

Set_to_zero(local_C);

/* Calculate addresses for circular shift of B */

source = (grid->my_row + 1) % grid->q; // (l + 1) mod q

dest = (grid->my_row + grid->q - 1) % grid->q; // (l + q - 1) mod q

/* Set aside storage for the broadcast block of A */

temp_A = Local_matrix_allocate(n_bar);

for (stage = 0; stage < grid->q; stage++) {

bcast_root = (grid->my_row + stage) % grid->q; // (l + stage) mod q --> (n)

if (bcast_root == grid->my_col) { // if (n == k)

MPI_Bcast(local_A, 1, local_matrix_mpi_t, // Broadcasting of A[l,k] (sub-block)

bcast_root, grid->row_comm);

Local_matrix_multiply(local_A, local_B, // C[l,k] += (A[l,n] * B[n,k])

local_C);

}

else {

MPI_Bcast(temp_A, 1, local_matrix_mpi_t,

bcast_root, grid->row_comm);

Local_matrix_multiply(temp_A, local_B,

local_C);

}

MPI_Sendrecv_replace(local_B, 1, local_matrix_mpi_t, //Sends and receives using a single buffer

dest, 0/* sendtag*/, source, 0/*recvtag*/, grid->col_comm, &status);

} /* for */

} /* Fox */

/*********************************************************/

LOCAL_MATRIX_T* Local_matrix_allocate(int local_order) {

LOCAL_MATRIX_T* temp;

temp = (LOCAL_MATRIX_T*) malloc(sizeof(LOCAL_MATRIX_T));

return temp;

} /* Local_matrix_allocate */

/*********************************************************/

void Free_local_matrix(

LOCAL_MATRIX_T** local_A_ptr /* in/out */) {

free(*local_A_ptr);

} /* Free_local_matrix */

/*********************************************************/

/* Generate and distribute the matrix:

* foreach global row of the matrix,

* foreach grid column,

* generate a block of n_bar floats on process 0

* and send them to the appropriate process.

*/

void Generate_matrix(

char* title /* in */,

LOCAL_MATRIX_T* local_A /* out */,

GRID_INFO_T* grid /* in */,

int n /* in */) {

int mat_row, mat_col;

int grid_row, grid_col;

int dest;

int coords[2];

float* temp;

MPI_Status status;

if (grid->my_rank == 0) {

//printf("%d\\n", grid->my_rank);

temp = (float*) malloc(Order(local_A)*sizeof(float));

printf("%s\\n", title);

fflush(stdout);

for (mat_row = 0; mat_row < n; mat_row++) {

grid_row = mat_row/Order(local_A);

coords[0] = grid_row;

for (grid_col = 0; grid_col < grid->q; grid_col++) {

coords[1] = grid_col;

MPI_Cart_rank(grid->comm, coords, &dest);

if (dest == 0) {

for (mat_col = 0; mat_col < Order(local_A); mat_col++)

//scanf("%f", (local_A->entries)+mat_row*Order(local_A)+mat_col);

*((local_A->entries)+mat_row*Order(local_A)+mat_col) = RandVal;

} else {

for(mat_col = 0; mat_col < Order(local_A); mat_col++)

//scanf("%f", temp + mat_col);

*(temp + mat_col) = RandVal;

MPI_Send(temp, Order(local_A), MPI_FLOAT, dest, 0,

grid->comm);

}

}

}

free(temp);

} else {

for (mat_row = 0; mat_row < Order(local_A); mat_row++)

MPI_Recv(&Entry(local_A, mat_row, 0), Order(local_A),

MPI_FLOAT, 0, 0, grid->comm, &status);

}

} /* Generate_matrix */

/*********************************************************/

void Print_matrix(

char* title /* in */,

LOCAL_MATRIX_T* local_A /* in */,

GRID_INFO_T* grid /* in */,

int n /* in */) {

int mat_row, mat_col;

int grid_row, grid_col;

int source;

int coords[2];

float* temp;

MPI_Status status;

if (grid->my_rank == 0) {

temp = (float*) malloc(Order(local_A)*sizeof(float));

printf("%s\\n", title);

for (mat_row = 0; mat_row < n; mat_row++) {

grid_row = mat_row/Order(local_A);

coords[0] = grid_row;

for (grid_col = 0; grid_col < grid->q; grid_col++) {

coords[1] = grid_col;

MPI_Cart_rank(grid->comm, coords, &source);

if (source == 0) {

for(mat_col = 0; mat_col < Order(local_A); mat_col++)

printf("%6.1f ", Entry(local_A, mat_row, mat_col));

} else {

MPI_Recv(temp, Order(local_A), MPI_FLOAT, source, 0,

grid->comm, &status);

for(mat_col = 0; mat_col < Order(local_A); mat_col++)

printf("%6.1f ", temp[mat_col]);

}

}

printf("\\n");

}

free(temp);

} else {

for (mat_row = 0; mat_row < Order(local_A); mat_row++)

MPI_Send(&Entry(local_A, mat_row, 0), Order(local_A),

MPI_FLOAT, 0, 0, grid->comm);

}

} /* Print_matrix */

/*********************************************************/

void Set_to_zero(

LOCAL_MATRIX_T* local_A /* out */) {

int i, j;

for (i = 0; i < Order(local_A); i++)

for (j = 0; j < Order(local_A); j++)

Entry(local_A,i,j) = 0.0;

} /* Set_to_zero */

/*********************************************************/

void Build_matrix_type(

LOCAL_MATRIX_T* local_A /* in */) {

MPI_Datatype temp_mpi_t;

int block_lengths[2];

MPI_Aint displacements[2];

MPI_Datatype typelist[2];

MPI_Aint start_address;

MPI_Aint address;

MPI_Type_contiguous(Order(local_A)*Order(local_A),

MPI_FLOAT, &temp_mpi_t); //array[0..n-1,0..n-1] of float

block_lengths[0] = block_lengths[1] = 1;

typelist[0] = MPI_INT; //2 elements: 1) int, 2) array[...] of float

typelist[1] = temp_mpi_t;

MPI_Address(local_A, &start_address); //address of the start of the submatrix

MPI_Address(&(local_A->n_bar), &address); //address of the submatrix order (n_bar)

displacements[0] = address - start_address; //offset of n_bar from the start (may be nonzero)

MPI_Address(local_A->entries, &address); //address of entries

displacements[1] = address - start_address; //offset of entries from the start

MPI_Type_struct(2, block_lengths, displacements, //new struct type

typelist, &local_matrix_mpi_t);

MPI_Type_commit(&local_matrix_mpi_t); //commit the new type so it can be used in communication

} /* Build_matrix_type */

/*********************************************************/

void Local_matrix_multiply(

LOCAL_MATRIX_T* local_A /* in */,

LOCAL_MATRIX_T* local_B /* in */,

LOCAL_MATRIX_T* local_C /* out */) {

int i, j, k;

for (i = 0; i < Order(local_A); i++)

for (j = 0; j < Order(local_A); j++)

for (k = 0; k < Order(local_B); k++)

Entry(local_C,i,j) += Entry(local_A,i,k)*Entry(local_B,k,j);

} /* Local_matrix_multiply */

/*********************************************************/

double Sequential_time(int n) {

int i,j,k;

double begin_time, end_time;

float *A = (float *) malloc(n*n * sizeof(float));

float* B = (float *) malloc(n*n * sizeof(float));

float* C = (float *) malloc(n*n * sizeof(float));

for (i = 0; i < n*n; i++) {

A[i] = RandVal; B[i] = RandVal; C[i] = 0.0;

}

begin_time = MPI_Wtime();

for (i = 0; i < n; i++) {

for (j = 0; j < n; j++) {

for (k = 0; k < n; k++) {

C[i*n+j] += A[i*n+k]*B[k*n+j];

}

}

}

end_time = MPI_Wtime();

free(A); free(B); free(C);

return end_time - begin_time;

}
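
Below is a rough sketch of what I was planning to try for the hybrid setup; I am not at all sure it is correct. My assumption is that only the master thread of each rank ever calls MPI (MPI_THREAD_FUNNELED), with OpenMP used only inside the local multiply:

/* Rough sketch only (my guess): ask MPI for a thread support level that
   tolerates OpenMP threads inside each rank, with only the master thread
   making MPI calls (MPI_THREAD_FUNNELED). */
#include <stdio.h>
#include <mpi.h>
#include <omp.h>

int main(int argc, char* argv[]) {
    int provided, my_rank;

    /* Instead of MPI_Init(&argc, &argv) in the code above */
    MPI_Init_thread(&argc, &argv, MPI_THREAD_FUNNELED, &provided);
    MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);

    if (provided < MPI_THREAD_FUNNELED && my_rank == 0)
        printf("Warning: MPI library only provides thread level %d\n", provided);

    if (my_rank == 0)
        printf("Each rank can use up to %d OpenMP threads\n", omp_get_max_threads());

    MPI_Finalize();
    return 0;
}

I would then compile with something like mpicc -fopenmp (or -openmp/-qopenmp with the Intel compiler) and set OMP_NUM_THREADS=4 for each rank. Is that the right approach?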


You'd better ask in this forum.
I'm not sure that people who know OpenMP read this forum.

Regards!
Dmitry

Parallel Studio might be useful for optimizing a single multi-threaded rank on a Windows target. Basically, you would like to thread the outermost loop of the local multiply, as sketched below. You might get a better combination of a vectorized inner loop and a parallel outer loop by borrowing some ideas from public sources, particularly if this is a class exercise. If not, you would likely get ahead faster by using one of the pre-optimized libraries (MKL is already included with the Intel compilers).
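
For example, something along these lines (an untested sketch, reusing the LOCAL_MATRIX_T type and the Order/Entry macros from your listing; the iterations over i are independent, since each thread only updates its own rows of local_C):

#include <omp.h>

/* Untested sketch: thread the outermost loop of the local block multiply.
   i is private by default as the parallel loop index; j and k must be
   made private explicitly because they are declared outside the loop. */
void Local_matrix_multiply(LOCAL_MATRIX_T* local_A,
                           LOCAL_MATRIX_T* local_B,
                           LOCAL_MATRIX_T* local_C) {
    int i, j, k;

#pragma omp parallel for private(j, k)
    for (i = 0; i < Order(local_A); i++)
        for (j = 0; j < Order(local_A); j++)
            for (k = 0; k < Order(local_B); k++)
                Entry(local_C,i,j) += Entry(local_A,i,k) * Entry(local_B,k,j);
}

With MPI_THREAD_FUNNELED this is safe because all MPI calls in Fox() are made outside the parallel region, on the master thread.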
