Hi all,
I have a program in which I used the dsyevd routine from lapack to calculate a complete eigensystem, however since I already use MPI in my program, I wanted to use the pdsyev routine from scalapack.
To compile I used the gnu compiler version 4.3.2 and openmpi version 1.4.2 (compiled with gnu) and linked with mkl library version 10.2.5.035.
Everything worked as I expected and gave the correct results for my test-case.
However, I want to use the program on our cluster and because of some irrelevant issues I needed to compile with an intel compiler.
I installed the intel compiler suite and recompiled openmpi for use with ifort. After some small changes my program compiled without warning and my program runs without error... but the results of the calculations are incorrect!!
The results are only incorrect if I use more then 1 MPI process, so when I run 'mpirun -n 1 myprog' it gives the correct result.
I tried compiling with -unroll0 but this didn't make a difference.
Does anybody have an idea where something could go wrong?
the full line I used to compile is:
/home/bart/progs/openmpi-1.4.2-icc/bin/mpif90 main.f90 integration.f90 subroutines.f90 interpolation.f90 functions.f90 data.f90 -warn all -L /home/bart/intel/mkl/10.2.5.035/lib/em64t -lmkl_scalapack_lp64 -lmkl_lapack -lmkl_blacs_openmpi_lp64 -lmkl_intel_lp64 -lmkl_intel_thread -lmkl_core -liomp5 -openmp -unroll0
and on the gnu compiler I used:
/home/bart/progs/openmpi-1.4.2-gcc/bin/mpif90 -o main -g main.f90 integration.f90 subroutines.f90 interpolation.f90 functions.f90 data.f90 -L /home/bart/intel/mkl/10.2.5.035/lib/em64t -lmkl_scalapack_lp64 -lmkl_lapack -lmkl_blacs_openmpi_lp64 -lmkl_intel_lp64 -lmkl_gnu_thread -lmkl_core -liomp5 -fopenmp
The piece of code where I use the scalapack routine is in a loop (that's why I tried unroll0) and looks like this:
do b1=1,n
do b2=1,n
call pdelset(lbigomega,b1,b2,desca,bigomega(b1,b2))
end do
end do
lwork = -1
deallocate(work)
allocate(work(1))
call pdsyev(jobz,uplo,n,lbigomega,1,1,desca,w,lz,1,1,descz,work,lwork,info)
lwork = int(abs(work(1)))
deallocate(work)
allocate(work(lwork))
call pdsyev(jobz,uplo,n,lbigomega,1,1,desca,w,lz,1,1,descz,work,lwork,info)
if (myid == 0) then
loca = numroc(n,nb,myrow,0,nprow)
locb = numroc(n,nb,mycol,0,npcol)
do node = 0, numprocs - 1
if (node==0) then
irow = myrow
icol = mycol
else
call MPI_Recv(irow,1,MPI_integer,node,10,MPI_COMM_WORLD,status0,ierr)
call MPI_Recv(icol,1,MPI_integer,node,20,MPI_COMM_WORLD,status0,ierr)
call MPI_Recv(loca,1,MPI_integer,node,40,MPI_COMM_WORLD,status0,ierr)
call MPI_Recv(locb,1,MPI_integer,node,50,MPI_COMM_WORLD,status0,ierr)
deallocate(lz)
allocate(lz(loca,locb))
call MPI_Recv(lz,loca*locb,MPI_REAL8,node,30,MPI_COMM_WORLD,status0,ierr)
end if
nbrows = ceiling(real(loca)/real(nb))
nbcolumns = ceiling(real(locb)/real(nb))
do ibcolumns = 1, nbcolumns
if (ibcolumns==nbcolumns) then
col_max = locb - (nbcolumns-1)*nb
else
col_max = nb
end if
do ibrows = 1, nbrows
if (ibcolumns==nbcolumns) then
row_max = loca - (nbrows-1)*nb
else
row_max = nb
end if
do columns = 1, col_max
do rows = 1, row_max
z((irow+(ibrows-1)*nprow)*nb + rows, (icol+(ibcolumns-1)*npcol)*nb + columns) &
& = lz((ibrows-1)*nb + rows, (ibcolumns-1)*nb + columns)
end do
end do
end do
end do
end do
else
call MPI_Send(myrow, 1, MPI_Integer, 0, 10, MPI_COMM_WORLD, ierr)
call MPI_Send(mycol, 1, MPI_Integer, 0, 20, MPI_COMM_WORLD, ierr)
call MPI_Send(lda, 1, MPI_Integer, 0, 40, MPI_COMM_WORLD, ierr)
call MPI_Send(ldb, 1, MPI_Integer, 0, 50, MPI_COMM_WORLD, ierr)
call MPI_Send(lz, lda*ldb, MPI_REAL8, 0, 30, MPI_COMM_WORLD, ierr)
end if
call MPI_BCAST (z(1,1), n*n, MPI_REAL8,0, MPI_COMM_WORLD,ierr)
bigomega = z
What bothers me the most is that it works with one compiler and not with the other...
Thank you,
Bart


