integer*4, parameter :: l = 1000000 integer*4, parameter :: m = 100 integer*4, parameter :: n = 10 ! real*8 a(l,m), at(m,l), b(m,n), c(l,n), chk(l,n) real*8 te(0:6), t common /aaaa/ a, b, c, chk equivalence (a,at) integer*4 i ! call cpu_time (te(0)) a = 1 b = 1 call cpu_time (t) te(0) = t-te(0) write (*,*) te(0),' initialise variables' call matmul_test (a,b,c, chk, l,m,n, te(1:5)) ! call matmul_tran (at,b,c, chk, l,m,n, te(6:6)) ! ! do i = 0,6 ! write (*,*) i,te(i) ! /cpu ! end do ! end subroutine matmul_test (A,B,C, chk, l,m,n, te) ! integer*4 l,m,n, i,j,k real*8 A(l,m), B(m,n), C(l,n), chk(l,n), row(m), s real*8 te(0:4), t, err_max external err_max ! ! A(1000000,100), 800 mb ! B(100,10), 8 kb ! C(1000000,10) 80 mb ! ! Matmul Intrinsic call cpu_time (te(0)) ! Chk = MatMul ( A, B ) ! call cpu_time (t) te(0) = t - te(0) ! s = 0 do j = 1,n do i = 1,l s = max ( s, abs(Chk(i,j))) end do end do ! write (*,*) te(0), ' Matmul Intrinsic ', s ! ! Conventional Matrix Multiplication, using accumulator "s" call cpu_time (te(1)) do i = 1,l do j = 1,n s = 0 do k = 1,m s = s + A(i,k) * B(k,j) end do C(i,j) = s end do end do call cpu_time (t) te(1) = t - te(1) write (*,*) te(1), ' conventional loops ', err_max (c, chk, l,n) ! ! If s is not used - more general ( loop order swap for addressing A sequentially ) ! ( no need to transpose B ) call cpu_time (te(2)) C = 0 do k = 1,m do j = 1,n ! do i = 1,l ! C(i,j) = C(i,j) + A(i,k) * B(k,j) ! end do C(:,j) = C(:,j) + A(:,k) * B(k,j) end do end do call cpu_time (t) te(2) = t - te(2) write (*,*) te(2), ' vector addition ', err_max (c, chk, l,n) ! ! If row of A is used - sequential dot product, suits vector instruction ! call cpu_time (te(3)) do i = 1,l row(1:m) = A(i,1:m) do j = 1,n ! do k = 1,m ! s = s + A(i,k) * B(k,j) ! end do C(i,j) = Dot_Product (row, B(:,j)) end do end do call cpu_time (t) te(3) = t - te(3) write (*,*) te(3), ' row dot_product ', err_max (c, chk, l,n) ! ! If B is transposed - vector addition ! call cpu_time (te(4)) C = 0 do k = 1,m do j = 1,n ! do i = 1,l C(:,j) = C(:,j) + A(:,k) * B(k,j) ! * B(j,k) ! end do end do end do call cpu_time (t) te(4) = t - te(4) write (*,*) te(4), ' vector addition ', err_max (c, chk, l,n) ! end subroutine matmul_tran (A,B,C, chk, l,m,n, te) ! integer*4 l,m,n, i,j real*8 A(m,l), B(m,n), C(l,n), chk(l,n) real*8 te(1), t, err_max external err_max ! ! A(1000000,100), 800 mb ! B(100,10), 8 kb ! C(1000000,10) 80 mb ! ! If A is transposed - sequential dot product, suits vector instruction ! call cpu_time (te(1)) C = 0 do i = 1,l do j = 1,n ! do k = 1,m ! C(i,j) = C(i,j) + A(k,i) * B(k,j) ! end do C(i,j) = Dot_Product (A(:,i), B(:,j)) end do end do call cpu_time (t) te(1) = t - te(1) write (*,*) te(1), ' transpose dot_product', err_max (c, chk, l,n) ! end real*8 function err_max (c, chk, l, n) integer*4 l,n, i, j real*8 c(l,n), chk(l,n), x, xx ! xx = 0 do j = 1,n do i = 1,l x = abs(c(i,j)-chk(i,j)) if (x > xx) xx = x end do end do err_max = xx end