Visual Fortran 2011 and openMP are pretty new to me; I've been using C++ and C# for parallel programming on my system: Dell Studio XPS w/Intel i7 860 quad core running Windows 7-64 bit.

For a 1024x1024 (or larger) matrix multiplication test, I'm finding that a Fortran openMP routine runs considerably slower than the same sequential routine. The openMP routine appears to be properly using 8 threads when I look at Windows Task Manager during execution, and the results of the multiplication look OK.

Could someone please take a look at the following code and tell me if I'm using openMP correctly?

!****************************************************************************

!

! PROGRAM: openMP_speed_test

!

! PURPOSE: Test use of openMP

!

!****************************************************************************

subroutine mat_mult_trans(A,B,C,N)

!Use transpose of A in multiplication to make better use of cache

integer N

real*8 A(N,N),B(N,N),C(N,N),temp

!transpose A

do i=1,N-1

do j=i+1,N

temp=A(i,j)

A(i,j)=A(j,i)

A(j,i)=temp

end do

end do

!do the multiplication

do i=1,N

do j=1,N

temp=0.0D0

do k=1,N

temp=temp+A(k,i)*B(k,j)

end do

C(i,j)=temp

end do

end do

!restore A

do i=1,N-1

do j=i+1,N

temp=A(i,j)

A(i,j)=A(j,i)

A(j,i)=temp

end do

end do

end subroutine

subroutine par_mat_mult_trans(A,B,C,N)

!same as mat_mult_trans, but using openMP

integer N,nthreads,TID,chunk

Integer OMP_GET_NUM_THREADS, OMP_GET_THREAD_NUM

real*8 A(N,N),B(N,N),C(N,N),temp

chunk=32

!$OMP PARALLEL SHARED(A,B,C,NTHREADS,CHUNK)

TID = OMP_GET_THREAD_NUM()

IF (TID .EQ. 0) THEN

NTHREADS = OMP_GET_NUM_THREADS()

PRINT *, 'Starting matrix multiply with ', NTHREADS,' threads'

END IF

!$OMP DO SCHEDULE(DYNAMIC, CHUNK)

do i=1,N-1

do j=i+1,N

temp=A(i,j)

A(i,j)=A(j,i)

A(j,i)=temp

end do

end do

!$OMP DO SCHEDULE(DYNAMIC, CHUNK)

do i=1,N

do j=1,N

temp=0.0D0

do k=1,N

temp=temp+A(k,i)*B(k,j)

end do

C(i,j)=temp

end do

end do

!$OMP DO SCHEDULE(DYNAMIC, CHUNK)

do i=1,N-1

do j=i+1,N

temp=A(i,j)

A(i,j)=A(j,i)

A(j,i)=temp

end do

end do

!$omp end parallel

end subroutine

program openMP_speed_test

Integer N, max_threads

real(kind=8), allocatable :: A(:,:),B(:,:),C(:,:),D(:)

real(kind=8), allocatable :: c2(:,:)

real*8 t1,t2,num,diff

character(len=1) ans

ans='y'

do while((ans.eq.'y').or.(ans.eq.'Y'))

print *,"Matrix size?"

READ *,N

allocate(A(N,N),B(N,N),C(N,N),c2(N,N),D(N))

do i=1,N

do j=1,N

call random_number(num)

if (num<0.5D0) then

num=-1.0D0

else

num=1.0D0

end if

call random_number(A(i,j))

A(i,j)=num*A(i,j)

call random_number(num)

if (num<0.5D0) then

num=+1.0D0

else

num=-1.0D0

end if

call random_number(B(i,j))

B(i,j)=num*B(i,j)

C(i,j)=0.0D0

c2(i,j)=0.0D0

end do

end do

!run standard matrix multiplication using transpose(A) for

!better cache use

PRINT *,"Starting mat_mult_trans."

call cpu_time(t1)

call Mat_mult_trans(A,B,C,N)

call cpu_time(t2)

t2=t2-t1

print 120, t2

120 format("mat_mult_trans execution time=",f6.3," secs")

!Run Fortran matmul function

PRINT *,"Starting Fortran matmul."

call cpu_time(t1)

c2= matmul(A,B)

call cpu_time(t2)

t2=t2-t1

print 140, t2

140 format("canned matmul execution time=",f6.3," secs")

!Check results vs. 'standard'

diff=0.0D0

do i=1,N

do j=1,N

if (abs(C(i,j)-c2(i,j))>diff) then

diff=abs(C(i,j)-c2(i,j))

end if

end do

end do

print 121, diff

121 format("Maximum error=",e14.6)

!Run openMP version of 'standard' with transpose(A)

PRINT *,"Starting par_mat_mult_trans"

call cpu_time(t1)

call par_mat_mult_trans(A,B,c2,N)

call cpu_time(t2)

t2=t2-t1

print 131, t2

131 format("par_mat_mult_trans execution time=",f6.3," secs")

!Check results vs 'standard'

diff=0.0D0

do i=1,N

do j=1,N

if (abs(C(i,j)-c2(i,j))>diff) then

diff=abs(C(i,j)-c2(i,j))

end if

end do

end do

print 121, diff

!Check for more work

print *,"More?"

read *,ans

deallocate(A,B,C,c2,D)

end do

end program

!********************************************************************************

In addition to the speed question, times appear very variable. For three successive runs on 1024x1024 matrices, I get the following times (in secs.)

mat_mult_trans: 0.546 0.671 0.686

Fortran matmul: 0.234 0.234 0.234

par_mat_mult_trans: 0.983 1.310 1.420

TIA for any help,

Don Ritchie