integer*4, parameter :: l = 1000000 ! 100000 ! 1000000 integer*4, parameter :: m = 100 ! 10 ! 100 integer*4, parameter :: n = 10 ! 1000 ! 10 ! real*8 a(l,m), at(m,l), b(m,n), c(l,n), chk(l,n) real*4 t(2,0:15) real*8 ts(2), te(2) common /aaaa/ a, b, c, chk equivalence (a,at) integer*4 i ! call time_step (ts) a = 1 b = 1 call time_step (te) t(:,0) = te - ts write (*,*) t(:,0),' initialise variables ',l,m,n write (*,*) ' ' write (*,*) ' CPU Elapse Description Error' call matmul_test (a,b,c, chk, l,m,n, t(:,1:6)) ! call matmul_tran (at,b,c, chk, l,m,n, t(:,11:15)) ! ! do i = 0,9 ! write (*,*) i,t(:,i) ! /cpu ! end do ! end subroutine matmul_test (A,B,C, chk, l,m,n, t) ! integer*4 l,m,n, i,j,k real*8 A(l,m), B(m,n), C(l,n), chk(l,n), row(m), s real*4 t(2,0:5), err_max real*8 ts(2), te(2) external err_max ! ! A(1000000,100), 800 mb ! B(100,10), 8 kb ! C(1000000,10) 80 mb ! ! 0) Matmul Intrinsic call time_step (ts) ! Chk = MatMul ( A, B ) ! call time_step (te) ; t(:,0) = te - ts ! s = 0 do j = 1,n do i = 1,l s = max ( s, abs(Chk(i,j))) end do end do ! write (*,*) t(:,0), ' Matmul Intrinsic ', real(s) ! ! 1) Classic loops call time_step (ts) C = 0 do i = 1,l do j = 1,n do k = 1,m C(i,j) = C(i,j) + A(i,k) * B(k,j) end do end do end do call time_step (te) ; t(:,1) = te - ts write (*,*) t(:,1), ' classic loops ', err_max (c, chk, l,n) ! ! 2) Conventional Matrix Multiplication, using accumulator "s" call time_step (ts) do i = 1,l do j = 1,n s = 0 do k = 1,m s = s + A(i,k) * B(k,j) end do C(i,j) = s end do end do call time_step (te) ; t(:,2) = te - ts write (*,*) t(:,2), ' conventional loops ', err_max (c, chk, l,n) ! ! 3) If s is not used - more general ( loop order swap for addressing A sequentially ) ! ( no need to transpose B ) call time_step (ts) C = 0 do k = 1,m do j = 1,n ! do i = 1,l ! C(i,j) = C(i,j) + A(i,k) * B(k,j) ! end do C(:,j) = C(:,j) + A(:,k) * B(k,j) end do end do call time_step (te) ; t(:,3) = te - ts write (*,*) t(:,3), ' vector addition ', err_max (c, chk, l,n) ! ! 4) If row of A is used - sequential dot product, suits vector instruction ! call time_step (ts) do i = 1,l row(1:m) = A(i,1:m) do j = 1,n ! do k = 1,m ! s = s + A(i,k) * B(k,j) ! end do C(i,j) = Dot_Product (row, B(:,j)) end do end do call time_step (te) ; t(:,4) = te - ts write (*,*) t(:,4), ' row dot_product ', err_max (c, chk, l,n) ! ! 5) If B is transposed - vector addition ( no advantage with B' ??) ! call time_step (ts) C = 0 do k = 1,m do j = 1,n ! do i = 1,l ! C(:,j) = C(:,j) + A(:,k) * B(k,j) ! * B(j,k) ! end do call Vec_Add (C(1,j), A(1,k), l, B(k,j)) end do end do call time_step (te) ; t(:,5) = te - ts write (*,*) t(:,5), ' vector addition call ', err_max (c, chk, l,n) ! end subroutine Vec_Add (C, A, l, b) integer*4 l real*8 C(l), A(l), b C = C + A * b end subroutine matmul_tran (A,B,C, chk, l,m,n, t) ! integer*4 l,m,n, i,j, li,lj real*8 A(m,l), B(m,n), C(l,n), chk(l,n) real*4 t(2,5), err_max real*8 Vec_Sum, ts(2), te(2), last, this external Vec_Sum, err_max ! INTERFACE PURE real*8 function pure_Sum (U, V, n) integer*4, intent(IN) :: n real*8, intent(IN) :: U(n), V(n) end function pure_sum END INTERFACE ! ! A'(100,1000000), 800 mb ! B(100,10), 8 kb ! C(1000000,10) 80 mb ! ! 1) If A is transposed - sequential dot product, suits vector instruction ! call time_step (ts) do i = 1,l do j = 1,n C(i,j) = dot_product (A(:,i), B(:,j)) end do end do call time_step (te) ; t(:,1) = te - ts write (*,*) t(:,1), ' transpose dot_product ', err_max (c, chk, l,n) ! ! 2) use of for_all ! call time_step (ts) forall (i=1:l, j=1:n) C(i,j) = dot_product (A(:,i), B(:,j)) end forall call time_step (te) ; t(:,2) = te - ts write (*,*) t(:,2), ' transpose FOR ALL ', err_max (c, chk, l,n) ! ! ! 3) alternative vector_sum ! call time_step (ts) do i = 1,l do j = 1,n C(i,j) = Vec_Sum (A(1,i), B(1,j), m) end do end do call time_step (te) ; t(:,3) = te - ts write (*,*) t(:,3), ' transpose Vec_Sum ', err_max (c, chk, l,n) ! ! 4) Pure vector_sum ! call time_step (ts) do i = 1,l do j = 1,n C(i,j) = Pure_Sum (A(1,i), B(1,j), m) end do end do call time_step (te) ; t(:,4) = te - ts write (*,*) t(:,4), ' transpose Pure_Sum ', err_max (c, chk, l,n) ! write (*,*) ' ', chk(l,n) ! ! 5) DO Concurrent ! call time_step (ts) ! do concurrent (i = 1:l, j=1:n) ! this failed ! ! this gives incorrect values and poor performance for C(i,j) ( chk is being corrupted !! ) ! do i = 1,l do j = 1,n do concurrent (i = 1:l) C(i,j) = dot_product (A(:,i), B(:,j)) end do end do call time_step (te) ; t(:,5) = te - ts write (*,*) t(:,5), ' DO CONCURRENT ', err_max (c, chk, l,n), chk(1,1) ! ! report changes to C write (*,*) ' ' write (*,*) 'Report changes to C(l,n)', c(1,1) last = -1 li = -1 lj = -1 do j = 1,n do i = 1,l this = C(i,j) if (this /= last) then if (li/=-1) write (*,*) li,lj, last write (*,*) i,j, this end if last = this li = i lj = j end do end do write (*,*) li,lj, this, chk(l,n) ! ! report changes to C write (*,*) ' ' write (*,*) 'Report changes to chk(l,n)', chk(1,1) last = -1 li = -1 lj = -1 do j = 1,n do i = 1,l this = chk(i,j) if (this /= last) then if (li/=-1) write (*,*) li,lj, last write (*,*) i,j, this end if last = this li = i lj = j end do end do write (*,*) li,lj, this, chk(l,n) ! end real*8 function Vec_Sum (A, B, n) integer*4 n, i real*8 A(n), B(n), s s = 0 do i = 1,n s = s + a(i)*b(i) end do Vec_Sum = s end PURE real*8 function pure_Sum (A, B, n) integer*4, intent(IN) :: n real*8, intent(IN) :: A(n), B(n) ! integer*4 i real*8 s ! s = 0 do i = 1,n s = s + a(i)*b(i) end do pure_Sum = s end real*4 function err_max (c, chk, l, n) integer*4 l,n, i, j real*8 c(l,n), chk(l,n), x, xx ! xx = 0 do j = 1,n do i = 1,l x = abs(c(i,j)-chk(i,j)) if (x > xx) xx = x end do end do err_max = xx end subroutine time_step ( t ) real*8 t(2) integer*8 count, count_rate, count_max ! call cpu_time (t(1)) call SYSTEM_CLOCK (count, count_rate, count_max) t(2) = dble (count) / dble(count_rate) end