MKL ScaLAPACK + MVAPICH + 100 lines of code = CRASH

MKL ScaLAPACK + MVAPICH + 100 lines of code = CRASH

The following code has reliably reproduced a crash using MKL 10.2 update 2 (both the sequential and threaded versions) and the latest revision of MVAPICH, on two different clusters. Can anybody tell me what the problem is here? It does not always crash, but it does crash when the right combination of MPI process count and matrix size is selected.

A

/*

* crash.cpp - crashes with ICC 11.1, MKL 10.2, MVAPICH 1.0 on linux 64-bit

* both linked with the serial or threaded libraries

* doing mpirun -np 36 crash 5000 10

*/

#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include "mpi.h"

#include "mkl_scalapack.h"

/*
 * Prototypes for the Fortran/C routines used below. These are normally
 * provided by vendor headers; they are declared here explicitly so the
 * file is self-contained.
 */
extern "C" {

/* BLACS C interface */
void Cblacs_get(int context, int request, int* value);
int  Cblacs_gridinit(int* context, char* order, int np_row, int np_col);
void Cblacs_gridinfo(int context, int* np_row, int* np_col,
                     int* my_row, int* my_col);

/* ScaLAPACK tool: number of rows/columns owned locally under a
 * block-cyclic distribution. */
int numroc_(int* n, int* nb, int* iproc, int* isrcproc, int* nprocs);

/* ScaLAPACK LU factorization. Called in main() but never declared in the
 * original source -- C++ has no implicit declarations, so this prototype
 * is required (unless mkl_scalapack.h already provides it -- harmless
 * either way as long as the signatures agree). */
void pdgetrf_(int* m, int* n, double* a, int* ia, int* ja, int* desca,
              int* ipiv, int* info);

/* PBLAS distributed matrix-matrix multiply */
void pdgemm_(char* TRANSA, char* TRANSB, int* M, int* N, int* K,
             double* ALPHA,
             double* A, int* IA, int* JA, int* DESCA,
             double* B, int* IB, int* JB, int* DESCB,
             double* BETA,
             double* C, int* IC, int* JC, int* DESCC);

}

/* Block-cyclic distribution block size (rows and columns).
 * NOTE: the original read "#define BLOCK_SIZE65" (missing space), which
 * defines the wrong symbol and leaves BLOCK_SIZE undefined where used. */
#define BLOCK_SIZE 65

/*
 * Entry point. Builds the largest square BLACS process grid that fits in
 * the MPI world, distributes m x m matrices block-cyclically, then runs
 * `rounds` iterations of PDGEMM (C = A * B^T) followed by PDGETRF (LU
 * factorization of C).
 *
 * Usage: mpirun -np P crash <matrix_size> <rounds>
 *
 * Returns 0 on success, 1 on bad command line.
 */
int main(int argc, char* argv[])
{
    int iam, nprocs;

    MPI_Init(&argc, &argv); /* starts MPI */
    MPI_Comm_rank(MPI_COMM_WORLD, &iam);
    MPI_Comm_size(MPI_COMM_WORLD, &nprocs);

    /* Validate the command line before touching argv[1]/argv[2]. */
    if (argc < 3) {
        if (iam == 0)
            printf("usage: %s <matrix_size> <rounds>\n", argv[0]);
        MPI_Finalize();
        return 1;
    }

    /* Largest square grid that fits; leftover ranks exit immediately. */
    int blacs_pgrid_size = (int)floor(sqrt((double)nprocs));

    if (iam >= blacs_pgrid_size * blacs_pgrid_size) {
        printf("Bye bye world from process %d of %d. BLACS had no place for me...\n",
               iam, nprocs);
        MPI_Finalize();
        /* BUG FIX: the original fell through here and went on to call
         * Cblacs_gridinit/PBLAS AFTER MPI_Finalize, which is erroneous
         * and a likely cause of the intermittent crash. */
        return 0;
    }

    /* Start BLACS with a square processor grid. */
    if (iam == 0)
        printf("starting BLACS...");

    int ictxt, nprow, npcol, myrow, mycol;
    char grid_order[] = "C"; /* writable buffer: Cblacs_gridinit takes char* */

    Cblacs_get(-1, 0, &ictxt);
    Cblacs_gridinit(&ictxt, grid_order, blacs_pgrid_size, blacs_pgrid_size);
    Cblacs_gridinfo(ictxt, &nprow, &npcol, &myrow, &mycol);

    if (iam == 0)
        printf("done.\n");

    double timing;
    int m, n, k, lm, ln, nbm, nbn, rounds;
    int myzero = 0, myone = 1;

    sscanf(argv[1], "%d", &m);
    n = m; /* square problem: A, B, C are all m x m */
    k = m;
    sscanf(argv[2], "%d", &rounds);

    nbm = BLOCK_SIZE; /* row block size */
    nbn = BLOCK_SIZE; /* column block size */

    /* Local extents of the distributed matrices on this process. */
    lm = numroc_(&m, &nbm, &myrow, &myzero, &nprow);
    ln = numroc_(&n, &nbn, &mycol, &myzero, &npcol);

    int info;
    /* PDGETRF requires IPIV of length LOCr(M_A) + MB_A; the original
     * over-allocated by 10,000,000 ints (~40 MB/rank) "just in case". */
    int* ipiv = new int[lm + nbm];

    char ta = 'N', tb = 'T';
    double alpha = 1.0, beta = 0.0;

    double* test1data = new double[(size_t)lm * ln];
    double* test2data = new double[(size_t)lm * ln];
    double* test3data = new double[(size_t)lm * ln];

    /* Fill A with small pseudo-random values in [0, 0.0099].
     * (Loop condition restored -- it was truncated in the posted source.) */
    for (int i = 0; i < lm * ln; i++)
        test1data[i] = (double)(rand() % 100) / 10000.0;

    int* test1desc = new int[9];
    int* test2desc = new int[9];
    int* test3desc = new int[9];

    test1desc[0] = 1;     // descriptor type (dense matrix)
    test1desc[1] = ictxt; // BLACS context
    test1desc[2] = m;     // global number of rows
    test1desc[3] = n;     // global number of columns
    test1desc[4] = nbm;   // row block size
    test1desc[5] = nbn;   // column block size (equal to row block size)
    test1desc[6] = 0;     // process row of the first matrix row
    test1desc[7] = 0;     // process column of the first matrix column
    test1desc[8] = lm;    // leading dimension of the local array

    /* All three matrices share the same distribution. */
    memcpy(test2desc, test1desc, 9 * sizeof(int));
    memcpy(test3desc, test1desc, 9 * sizeof(int));

    /* (Loop condition restored -- truncated in the posted source.) */
    for (int iter = 0; iter < rounds; iter++) {
        if (iam == 0)
            printf("iter %i - ", iter);

        // test2 = test1
        memcpy(test2data, test1data, (size_t)lm * ln * sizeof(double));

        // test3 = test1 * test2^T
        timing = MPI_Wtime();
        pdgemm_(&ta, &tb, &m, &n, &k,
                &alpha,
                test1data, &myone, &myone, test1desc,
                test2data, &myone, &myone, test2desc,
                &beta,
                test3data, &myone, &myone, test3desc);
        if (iam == 0)
            printf(" PDGEMM = %f |", MPI_Wtime() - timing);

        // test3 = LU(test3)
        timing = MPI_Wtime();
        pdgetrf_(&m, &n, test3data, &myone, &myone, test3desc, ipiv, &info);
        if (iam == 0) {
            printf(" PDGETRF = %f.\n", MPI_Wtime() - timing);
            /* Report factorization failures instead of ignoring info. */
            if (info != 0)
                printf("warning: PDGETRF returned info = %d\n", info);
        }
    }

    delete[] ipiv;
    /* BUG FIX: the original used "delete[] a, b, c;" -- the comma operator
     * means only the FIRST array was ever freed. */
    delete[] test1data;
    delete[] test2data;
    delete[] test3data;
    delete[] test1desc;
    delete[] test2desc;
    delete[] test3desc;

    MPI_Finalize();
    return 0;
}

3 posts / 0 new
Last post
For more complete information about compiler optimizations, see our Optimization Notice.

Hello,

Just tested your example using gcc 4.3.2 and mkl 10.2, with openmpi 1.3.3. No segfaults.

Also tested with icc 11.1, mkl 10.2, and openmpi 1.3.3. No problems.

Leave a Comment

Please sign in to add a comment. Not a member? Join today