integer*4, parameter :: l = 1000000 ! 100000 ! 1000000 integer*4, parameter :: m = 100 ! 10 ! 100 integer*4, parameter :: n = 10 ! 1000 ! 10 ! real*8 a(l,m), at(m,l), b(m,n), c(l,n), chk(l,n) real*4 t(2,0:17) real*8 ts(2), te(2) common /aaaa/ a, b, c, chk equivalence (a,at) integer*4 i,j ! call time_step (ts) do j = 1,m do i = 1,l call random_number (a(i,j)) ! a = 1 end do end do do j = 1,n do i = 1,m call random_number (b(i,j)) ! b = 1 end do end do c = 0 chk = 0 call time_step (te) t(:,0) = te - ts write (*,*) t(:,0),' initialise variables ' write (*,*) ' l = ',l write (*,*) ' m = ',m write (*,*) ' n = ',n write (*,*) ' maxval (a) =', maxval(a) write (*,*) ' maxval (b) =', maxval(b) call matmul_test (a,b,c, chk, l,m,n, t(:,1:7)) ! call matmul_tran (at,b,c, chk, l,m,n, t(:,10:17)) ! ! do i = 0,9 ! write (*,*) i,t(:,i) ! /cpu ! end do ! end subroutine matmul_test (A,B,C, chk, l,m,n, t) ! integer*4 l,m,n, i,j,k real*8 A(l,m), B(m,n), C(l,n), chk(l,n), row(m), s real*4 t(2,0:6), err_max real*8 ts(2), te(2) external err_max ! ! A(1000000,100), 800 mb ! B(100,10), 8 kb ! C(1000000,10) 80 mb ! ! 0) Classic loops call time_step (ts) C = 0 do i = 1,l do j = 1,n do k = 1,m C(i,j) = C(i,j) + A(i,k) * B(k,j) end do end do end do call time_step (te) ; t(:,0) = te - ts ! write (*,*) ' maxval (c) =', maxval(c) write (*,*) ' ' write (*,*) ' CPU Elapse Description Error' ! chk = c ! do i = 1,l ! do j = 1,n ! s = max ( s, abs(Chk(i,j))) ! end do ! end do write (*,*) t(:,0), '0) classic tripple loops ', err_max (c, chk, l,n) ! ! 1) Matmul Intrinsic C = 0 call time_step (ts) ! C = MatMul ( A, B ) ! call time_step (te) ; t(:,1) = te - ts write (*,*) t(:,1), '1) Matmul Intrinsic ', err_max (c, chk, l,n) ! ! 2) Conventional Matrix Multiplication, using accumulator "s" C = 0 call time_step (ts) do i = 1,l do j = 1,n s = 0 do k = 1,m s = s + A(i,k) * B(k,j) end do C(i,j) = s end do end do call time_step (te) ; t(:,2) = te - ts write (*,*) t(:,2), '2) conventional loops ', err_max (c, chk, l,n) ! ! 3) If s is not used - more general ( loop order swap for addressing A sequentially ) ! ( no need to transpose B ) call time_step (ts) C = 0 do k = 1,m do j = 1,n ! do i = 1,l ! C(i,j) = C(i,j) + A(i,k) * B(k,j) ! end do C(:,j) = C(:,j) + A(:,k) * B(k,j) end do end do call time_step (te) ; t(:,3) = te - ts write (*,*) t(:,3), '3) vector addition ', err_max (c, chk, l,n) ! ! 4) If row of A is used - sequential dot product, suits vector instruction ! C = 0 call time_step (ts) do i = 1,l row(1:m) = A(i,1:m) do j = 1,n ! do k = 1,m ! s = s + A(i,k) * B(k,j) ! end do C(i,j) = Dot_Product (row, B(:,j)) end do end do call time_step (te) ; t(:,4) = te - ts write (*,*) t(:,4), '4) row dot_product ', err_max (c, chk, l,n) ! ! 5) If B is transposed - vector addition ( no advantage with B' ??) ! call time_step (ts) C = 0 do k = 1,m do j = 1,n ! do i = 1,l ! C(:,j) = C(:,j) + A(:,k) * B(k,j) ! * B(j,k) ! end do call Vec_Add (C(1,j), A(1,k), l, B(k,j)) end do end do call time_step (te) ; t(:,5) = te - ts write (*,*) t(:,5), '5) vector addition call ', err_max (c, chk, l,n) ! ! 6) Row section - sequential dot product, suits vector instruction ! C = 0 call time_step (ts) do i = 1,l do j = 1,n C(i,j) = Dot_Product (A(i,:), B(:,j)) end do end do call time_step (te) ; t(:,6) = te - ts write (*,*) t(:,6), '6) dot_product section ', err_max (c, chk, l,n) end subroutine Vec_Add (C, A, l, b) integer*4 l real*8 C(l), A(l), b C = C + A * b end subroutine matmul_tran (A,B,C, chk, l,m,n, t) ! integer*4 l,m,n, i,j,k real*8 A(m,l), B(m,n), C(l,n), chk(l,n) real*4 t(2,0:7), err_max real*8 Vec_Sum, ts(2), te(2), s, lc(6) external Vec_Sum, err_max ! INTERFACE PURE real*8 function pure_Sum (U, V, n) integer*4, intent(IN) :: n real*8, intent(IN) :: U(n), V(n) end function pure_sum END INTERFACE ! ! A'(100,1000000), 800 mb ! B(100,10), 8 kb ! C(1000000,10) 80 mb ! write (*,*) ' ' ! ! 0) If A is transposed - Conventional Matrix Multiplication, using accumulator "s" C = 0 call time_step (ts) do i = 1,l do j = 1,n s = 0 do k = 1,m s = s + A(k,i) * B(k,j) end do C(i,j) = s end do end do call time_step (te) ; t(:,0) = te - ts chk = c write (*,*) t(:,0), 't0) transpose tripple loop ', err_max (c, chk, l,n) ! 1) If A is transposed - sequential dot product, suits vector instruction ! C = 0 call time_step (ts) do i = 1,l do j = 1,n C(i,j) = dot_product (A(:,i), B(:,j)) end do end do call time_step (te) ; t(:,1) = te - ts write (*,*) t(:,1), 't1) transpose dot_product ', err_max (c, chk, l,n) ! ! 2) use of for_all ! C = 0 call time_step (ts) forall (i=1:l, j=1:n) C(i,j) = dot_product (A(:,i), B(:,j)) end forall call time_step (te) ; t(:,2) = te - ts write (*,*) t(:,2), 't2) transpose FORALL ij ', err_max (c, chk, l,n) ! ! 3) alternative vector_sum ! C = 0 call time_step (ts) do i = 1,l do j = 1,n C(i,j) = Vec_Sum (A(1,i), B(1,j), m) end do end do call time_step (te) ; t(:,3) = te - ts write (*,*) t(:,3), 't3) transpose Vec_Sum ', err_max (c, chk, l,n) ! ! 4) Pure vector_sum ! C = 0 call time_step (ts) do i = 1,l do j = 1,n C(i,j) = Pure_Sum (A(1,i), B(1,j), m) end do end do call time_step (te) ; t(:,4) = te - ts write (*,*) t(:,4), 't4) transpose Pure_Sum ', err_max (c, chk, l,n) ! ! 5) sequential dot product; change loop order ! C = 0 call time_step (ts) do j = 1,n do i = 1,l C(i,j) = dot_product (A(:,i), B(:,j)) end do end do call time_step (te) ; t(:,5) = te - ts write (*,*) t(:,5), 't5) transpose d_p loop order ', err_max (c, chk, l,n) ! ! 6) use of for_all; change suggested loop order ! C = 0 call time_step (ts) forall (j=1:n, i=1:l) C(i,j) = dot_product (A(:,i), B(:,j)) end forall call time_step (te) ; t(:,6) = te - ts write (*,*) t(:,6), 't6) transpose FORALL ji ', err_max (c, chk, l,n) ! ! 7) DO Concurrent ! lc(1) = chk(1,1) lc(2) = chk(1,2) lc(3) = c(1,1) lc(4) = c(1,2) lc(5) = c(l,n) lc(6) = maxval (c) ! C = 0 call time_step (ts) ! do concurrent (i = 1:l, j=1:n) ! this failed ! ! this gives incorrect values and poor performance for C(i,j) ( chk is being corrupted !! ) ! do i = 1,l do j = 1,n do concurrent (i = 1:l) C(i,j) = dot_product (A(:,i), B(:,j)) end do end do call time_step (te) ; t(:,7) = te - ts write (*,*) t(:,7), 't7) DO CONCURRENT ', err_max (c, chk, l,n) ! write (*,*) ' ' write (*,*) ' DO CONCURRENT corrupts [chk] and does not update [C]' write (*,*) ' Before' write (*,*) ' chk(1,1)=', lc(1) write (*,*) ' chk(1,2)=', lc(2) write (*,*) ' c(1,1)=', lc(3) write (*,*) ' c(1,2)=', lc(4) write (*,*) ' c(l,n)=', lc(5) write (*,*) ' maxval (c) =', lc(6) ! write (*,*) ' After' write (*,*) ' chk(1,1)=', chk(1,1), real(lc(1) - chk(1,1)) write (*,*) ' chk(1,2)=', chk(1,2), real(lc(2) - chk(1,2)) write (*,*) ' c(1,1)=', c(1,1), real(lc(3) - c(1,1)) write (*,*) ' c(1,2)=', c(1,2), real(lc(4) - c(1,2)) write (*,*) ' c(l,n)=', c(l,n), real(lc(5) - c(l,n)) write (*,*) ' maxval (c) =', maxval(c) write (*,*) ' maxval (b) =', maxval(b) write (*,*) ' maxval (a) =', maxval(a) ! ! report changes to C !z call report_changes (c, l, n, 'c(l,n)') ! ! report changes to Chk !z call report_changes (chk, l, n, 'chk(l,n)') ! end real*8 function Vec_Sum (A, B, n) ! integer*4 n, i real*8 A(n), B(n), s ! s = 0 do i = 1,n s = s + a(i)*b(i) end do Vec_Sum = s end PURE real*8 function pure_Sum (A, B, n) ! integer*4, intent(IN) :: n real*8, intent(IN) :: A(n), B(n) ! integer*4 i real*8 s ! s = 0 do i = 1,n s = s + a(i)*b(i) end do pure_Sum = s end real*4 function err_max (c, chk, l, n) ! integer*4 l,n, i, j real*8 c(l,n), chk(l,n), x, xx ! xx = 0 do j = 1,n do i = 1,l x = abs(c(i,j)-chk(i,j)) if (x > xx) xx = x end do end do err_max = xx end subroutine time_step ( t ) ! real*8 t(2) integer*8 count, count_rate, count_max ! call cpu_time (t(1)) call SYSTEM_CLOCK (count, count_rate, count_max) t(2) = dble (count) / dble(count_rate) end subroutine report_changes (c, l, n, name) ! ! report changes to C ! integer*4 l,n, i,j, li,lj real*8 C(l,n), last, this character name*(*) ! write (*,*) ' ' write (*,*) 'Report changes to ', name, c(1,1) last = -1 li = -1 lj = -1 do j = 1,n do i = 1,l this = C(i,j) if (this /= last) then if (li/=-1) write (*,*) li,lj, last write (*,*) i,j, this end if last = this li = i lj = j end do end do write (*,*) li,lj, this ! end