config_thread_limit.f90

!*****************************************************************************
! Copyright(C) 2012-2013 Intel Corporation. All Rights Reserved.
!
! The source code, information  and  material ("Material") contained herein is
! owned  by Intel Corporation or its suppliers or licensors, and title to such
! Material remains  with Intel Corporation  or its suppliers or licensors. The
! Material  contains proprietary information  of  Intel or  its  suppliers and
! licensors. The  Material is protected by worldwide copyright laws and treaty
! provisions. No  part  of  the  Material  may  be  used,  copied, reproduced,
! modified, published, uploaded, posted, transmitted, distributed or disclosed
! in any way  without Intel's  prior  express written  permission. No  license
! under  any patent, copyright  or  other intellectual property rights  in the
! Material  is  granted  to  or  conferred  upon  you,  either  expressly,  by
! implication, inducement,  estoppel or  otherwise.  Any  license  under  such
! intellectual  property  rights must  be express  and  approved  by  Intel in
! writing.
!
! *Third Party trademarks are the property of their respective owners.
!
! Unless otherwise  agreed  by Intel  in writing, you may not remove  or alter
! this  notice or  any other notice embedded  in Materials by Intel or Intel's
! suppliers or licensors in any way.
!
!*****************************************************************************
! Content:
! An example of using DFTI_THREAD_LIMIT configuration parameter.
! The parameter specifies maximum number of OpenMP threads FFT can use.
!
! Values:
!   0 (default) = use number of threads specified by
!                 mkl_[domain_]set_num_threads()
!   Any positive integer N = use not more than N threads
!
!*****************************************************************************
program config_thread_limit

  use omp_lib

  ! Need double precision
  integer, parameter :: WP = selected_real_kind(15,307)

  ! Number of parallel user threads
  integer, parameter :: NUT = 2

  integer :: failed = 0
  integer :: thr_failed, me, team

  print *,"Example config_thread_limit"

  ! Enable nested parallel OpenMP sections (maybe oversubscribed)
  ! Use kind(4) literals because some compilers fail on this in i8 mode
  call omp_set_nested(.true._4)
  call omp_set_dynamic(.false._4)

  ! Enable threading of MKL called from OpenMP parallel sections
  call mkl_set_dynamic(0)

  print '(" Run parallel FFTs on "I0" parallel threads")', NUT
!$omp parallel num_threads(NUT) default(shared) private(thr_failed, me)

  me = omp_get_thread_num()
  team = omp_get_num_threads()
  if (me == 0) then
    print '(" Thread "I0": parallel team is "I0" threads")', me, team
  endif

  if (me == 0) then
    thr_failed = run_dft(me, 2, 100,200,300, -1,-2,-3)
  else
    thr_failed = run_dft(me, 3, 200,300,100, -1,-2,-3)
  endif
  if (0 /= thr_failed) failed = thr_failed

!$omp end parallel

  if (failed == 0) then
    print *,"TEST PASSED"
    call exit(0)
  else
    print *,"TEST FAILED"
    call exit(1)
  endif

contains

  integer function run_dft(tid,tlimit,n1,n2,n3,h1,h2,h3)

    use MKL_DFTI, forget => DFTI_DOUBLE, DFTI_DOUBLE => DFTI_DOUBLE_R

    integer :: tid              ! Id of this thread
    integer :: tlimit           ! Thread limit
    integer :: N1,N2,N3         ! Sizes of 3D transform
    integer :: H1,H2,H3         ! Arbitrary harmonic used to test the FFT


    ! Execution status
    integer :: status = 0, ignored_status

    ! DFTI descriptor handle
    type(DFTI_DESCRIPTOR), POINTER :: hand

    ! Data array
    complex(WP), allocatable :: x (:,:,:)

    ! Temporary variable
    integer :: tl

    print '(" Thread "I0": Create DFTI descriptor for "I0"x"I0"x"I0" FFT")', &
      & tid, N1, N2, N3
    status = DftiCreateDescriptor(hand, DFTI_DOUBLE, DFTI_COMPLEX, 3, [N1,N2,N3])
    if (0 /= status) goto 999

    print '(" Thread "I0": Set number of user threads "I0)', tid, tlimit
    status = DftiSetValue(hand, DFTI_THREAD_LIMIT, tlimit)
    if (0 /= status) goto 999

    ! If tlimit > 1 check if we linked with sequential MKL
    if (tlimit > 1) then
      ! Get thread limit of uncommitted descriptor
      status = DftiGetValue(hand, DFTI_THREAD_LIMIT, tl)
      if (0 /= status) goto 999
      print '(" Thread "I0": uncommitted descriptor thread limit "I0)', tid, tl
    endif

    print '(" Thread "I0": Commit DFTI descriptor")', tid
    status = DftiCommitDescriptor(hand)
    if (0 /= status) goto 999

    ! Get thread limit of committed descriptor
    status = DftiGetValue(hand, DFTI_THREAD_LIMIT, tl)
    if (0 /= status) goto 999
    print '(" Thread "I0": committed descriptor thread limit "I0)', tid, tl

    print '(" Thread "I0": Allocate data array")', tid
    allocate (x(N1, N2, N3))

    print '(" Thread "I0": Initialize input for forward transform")', tid
    call init(x, N1, N2, N3, H1, H2, H3)

    print '(" Thread "I0": Compute forward transform")', tid
    status = DftiComputeForward(hand, x(:, 1, 1))
    if (0 /= status) goto 999

    print '(" Thread "I0": Verify the result")', tid
    status = verify(x, N1, N2, N3, H1, H2, H3)
    if (0 /= status) goto 999

100 continue

    print '(" Thread "I0": Release the DFTI descriptor")', tid
    ignored_status = DftiFreeDescriptor(hand)

    if (allocated(x)) then
      print '(" Thread "I0": Deallocate data array")', tid
      deallocate(x)
    endif

    if (status == 0) then
      print '(" Thread "I0": Subtest Passed")', tid
    else
      print '(" Thread "I0": Subtest Failed")', tid
    endif

    run_dft = status
    return

999 print '(" Thread "I0":  Error, status = ",I0)', tid, status
    goto 100

  end function run_dft

  ! Compute mod(K*L,M) accurately
  pure real(WP) function moda(k,l,m)
    integer, intent(in) :: k,l,m
    integer*8 :: k8
    k8 = k
    moda = real(mod(k8*l,m),WP)
  end function moda

  ! Initialize arrays with harmonic /H1, H2, H3/
  subroutine init(x, N1, N2, N3, H1, H2, H3)
    integer N1, N2, N3, H1, H2, H3
    complex(WP) :: x(:,:,:)

    integer k1, k2, k3
    complex(WP), parameter :: I_TWOPI = (0,6.2831853071795864769_WP)

    forall (k1=1:N1, k2=1:N2, k3=1:N3)
      x(k1,k2,k3) = exp( I_TWOPI * ( &
        moda(  k1-1,H1, N1)/N1 &
        + moda(k2-1,H2, N2)/N2 &
        + moda(k3-1,H3, N3)/N3 )) / (N1*N2*N3)
    end forall
  end subroutine init

  ! Verify that x(:,:,:) are unit peaks at /H1, H2, H3/
  integer function verify(x, N1, N2, N3, H1, H2, H3)
    integer N1, N2, N3, H1, H2, H3
    complex(WP) :: x(:,:,:)

    integer k1, k2, k3
    real(WP) err, errthr, maxerr
    complex(WP) res_exp, res_got

    ! Note, this simple error bound doesn't take into account error of
    ! input data
    errthr = 5.0 * log(real(N1*N2*N3,WP)) / log(2.0_WP) * EPSILON(1.0_WP)
    print '("  Check if err is below errthr " G10.3)', errthr

    maxerr = 0
    do k3 = 1, N3
      do k2 = 1, N2
        do k1 = 1, N1
          if (mod(k1-1-H1, N1)==0 .AND.                                &
            mod  (k2-1-H2, N2)==0 .AND.                                &
            mod  (k3-1-H3, N3)==0) then
            res_exp = 1
          else
            res_exp = 0
          end if
          res_got = x(k1,k2,k3)
          err = abs(res_got - res_exp)
          maxerr = max(err,maxerr)
          if (.not.(err <= errthr)) then
            print '("  x("I0","I0","I0"):"$)', k1, k2, k3
            print '(" expected ("G24.17","G24.17"),"$)', res_exp
            print '(" got ("G24.17","G24.17"),"$)', res_got
            print '(" err "G10.3)', err
            print *," Verification FAILED"
            verify = 100
            return
          end if
        end do
      end do
    end do
    print '("  Verified,  maximum error was " G10.3)', maxerr
    verify = 0
  end function verify

end program config_thread_limit
For more complete information about compiler optimizations, see our Optimization Notice.