OpenMP* SIMD for Inclusive/Exclusive Scans

ID 771076
Updated 6/3/2023
Version Latest
Public

author-image

By

With Intel® C++ Compiler 19.0 and newer and with Intel® Fortran Compiler 19.1 and newer, including the Intel® oneAPI DPC++/C++ Compiler and Intel® Fortran Compiler available in the oneAPI Toolkits, the SIMD implementation of the inclusive and exclusive scan is supported. Starting with OpenMP* Version 5.0, the reduction clause supports scan patterns.

Explicit syntax for inclusive scan using C++

#pragma omp simd reduction(inscan, operator:list)
#pragma omp scan inclusive(item-list)

Explicit syntax for inclusive scan using Fortran

!$omp simd reduction(inscan, operator : list)
!$omp scan inclusive(item-list)

Explicit syntax for exclusive scan using C++

#pragma omp simd reduction(inscan, operator:list)
#pragma omp scan exclusive(item-list)

Explicit syntax for exclusive scan using Fortran

!$omp simd reduction(inscan, operator : list)
!$omp scan exclusive(item-list)

Below is a C++ code snippet that computes a prefix sum using the SIMD scan feature. The implementation offers a serial version of the code as well as SIMD versions of both the inclusive and exclusive scans.

EXCLUSIVE SCAN

#include<iostream>
#include<math.h>
#include<string.h>
#include<chrono>
// Number of elements in the demo array.
constexpr int N = 16;

// Computes the exclusive prefix sum of a[i] = i twice -- once with a plain
// serial loop and once with an OpenMP SIMD scan -- and prints each result
// followed by the time the scan took. Always returns 0.
int main(){
        // steady_clock is monotonic, so stop - start cannot be distorted by
        // adjustments made to the system wall clock while a scan is timed.
        using Clock = std::chrono::steady_clock;

        int a[N], serial_scan[N], simd_scan[N];
        for(int i = 0; i < N; i++){
                a[i] = i;
                serial_scan[i] = 0;
                simd_scan[i] = 0;
        }

        //Serial Scan with "+" operator
        auto start = Clock::now();
        int scan_a = 0;
        for(int i = 0; i < N; i++){
                // Exclusive: store the running total before a[i] is added,
                // so element i holds the sum of a[0] .. a[i-1].
                serial_scan[i] = scan_a;
                scan_a += a[i];
        }
        auto stop = Clock::now();
        std::cout<<"Serial Scan Output:\n";
        for(int i = 0; i < N; i++)
                std::cout<<serial_scan[i]<<"\t";
        std::cout<<"\n";
        std::chrono::duration<double> elapsed_seconds = stop - start;
        std::cout<<"Time taken in seconds is "<<elapsed_seconds.count()<<"\n";

        //SIMD Scan with "+" operator
        start = Clock::now();
        scan_a = 0;
        #pragma omp simd reduction(inscan, +:scan_a)
        for(int i = 0; i < N; i++){
                // For an exclusive scan the statement that reads the scanned
                // value comes before the scan directive and the statement
                // that contributes this iteration's input comes after it.
                simd_scan[i] = scan_a;
                #pragma omp scan exclusive(scan_a)
                scan_a += a[i];
        }
        stop = Clock::now();
        std::cout<<"SIMD Scan Output:\n";
        for(int i = 0; i < N; i++)
                std::cout<<simd_scan[i]<<"\t";
        std::cout<<"\n";
        elapsed_seconds = stop - start;
        std::cout<<"Time taken in seconds is "<<elapsed_seconds.count()<<"\n";
        return 0;
}

INCLUSIVE SCAN

#include<iostream>
#include<math.h>
#include<string.h>
#include<chrono>
// Number of elements in the demo array.
constexpr int N = 16;

// Computes the inclusive prefix sum of a[i] = i twice -- once with a plain
// serial loop and once with an OpenMP SIMD scan -- and prints each result
// followed by the time the scan took. Always returns 0.
int main(){
        // steady_clock is monotonic, so stop - start cannot be distorted by
        // adjustments made to the system wall clock while a scan is timed.
        using Clock = std::chrono::steady_clock;

        int a[N], serial_scan[N], simd_scan[N];
        for(int i = 0; i < N; i++){
                a[i] = i;
                serial_scan[i] = 0;
                simd_scan[i] = 0;
        }

        //Serial Scan with "+" operator
        auto start = Clock::now();
        int scan_a = 0;
        for(int i = 0; i < N; i++){
                // Inclusive: add a[i] first, so element i holds the sum of
                // a[0] .. a[i].
                scan_a += a[i];
                serial_scan[i] = scan_a;
        }
        auto stop = Clock::now();
        std::cout<<"Serial Scan Output:\n";
        for(int i = 0; i < N; i++)
                std::cout<<serial_scan[i]<<"\t";
        std::cout<<"\n";
        std::chrono::duration<double> elapsed_seconds = stop - start;
        std::cout<<"Time taken in seconds is "<<elapsed_seconds.count()<<"\n";

        //SIMD Scan with "+" operator
        start = Clock::now();
        scan_a = 0;
        #pragma omp simd reduction(inscan, +:scan_a)
        for(int i = 0; i < N; i++){
                // For an inclusive scan the statement that contributes this
                // iteration's input comes before the scan directive and the
                // statement that reads the scanned value comes after it.
                scan_a += a[i];
                #pragma omp scan inclusive(scan_a)
                simd_scan[i] = scan_a;
        }
        stop = Clock::now();
        std::cout<<"SIMD Scan Output:\n";
        for(int i = 0; i < N; i++)
                std::cout<<simd_scan[i]<<"\t";
        std::cout<<"\n";
        elapsed_seconds = stop - start;
        std::cout<<"Time taken in seconds is "<<elapsed_seconds.count()<<"\n";
        return 0;
}

Here is the Fortran version of the same code snippet.

EXCLUSIVE SCAN

program exclusive
  ! Exclusive prefix sum ("+" operator) of a small integer array, computed
  ! once with a plain serial loop and once with an OpenMP SIMD inscan
  ! reduction. Each variant is repeated ntimes and timed with cpu_time.
  implicit none
  integer, parameter :: n = 16
  integer, parameter :: ntimes = 10000000   ! repetitions of each timed scan
  integer :: a(n), serial_scan(n), simd_scan(n)
  integer :: running                        ! running total carried through a scan
  integer :: k, rep
  real(8) :: t_begin, t_end

  ! Input holds 0, 1, ..., n-1; both result arrays start out zeroed.
  serial_scan = 0
  simd_scan = 0
  a = [(k - 1, k = 1, n)]

  ! --- Serial scan ---
  call cpu_time(t_begin)
  do rep = 1, ntimes
     running = 0
     do k = 1, n
        ! Store the total before adding a(k): element k gets a(1)+...+a(k-1).
        serial_scan(k) = running
        running = running + a(k)
     end do
  end do
  call cpu_time(t_end)
  print *, "Serial Scan Output: "
  print *, serial_scan
  print *, "Time taken in seconds is ", t_end - t_begin

  ! --- SIMD scan ---
  call cpu_time(t_begin)
  do rep = 1, ntimes
     running = 0
     !$omp simd reduction(inscan, +:running)
     do k = 1, n
        simd_scan(k) = running
        !$omp scan exclusive(running)
        running = running + a(k)
     end do
  end do
  call cpu_time(t_end)
  print *, "SIMD Scan Output: "
  print *, simd_scan
  print *, "Time taken in seconds is ", t_end - t_begin

end program exclusive

INCLUSIVE SCAN

program inclusive
  ! Inclusive prefix sum ("+" operator) of a small integer array, computed
  ! once with a plain serial loop and once with an OpenMP SIMD inscan
  ! reduction. Each variant is repeated ntimes and timed with cpu_time.
  implicit none
  integer, parameter :: n = 16
  integer, parameter :: ntimes = 10000000   ! repetitions of each timed scan
  integer :: a(n), serial_scan(n), simd_scan(n)
  integer :: running                        ! running total carried through a scan
  integer :: k, rep
  real(8) :: t_begin, t_end

  ! Input holds 0, 1, ..., n-1; both result arrays start out zeroed.
  serial_scan = 0
  simd_scan = 0
  a = [(k - 1, k = 1, n)]

  ! --- Serial scan ---
  call cpu_time(t_begin)
  do rep = 1, ntimes
     running = 0
     do k = 1, n
        ! Add a(k) before storing: element k gets a(1)+...+a(k).
        running = running + a(k)
        serial_scan(k) = running
     end do
  end do
  call cpu_time(t_end)
  print *, "Serial Scan Output: "
  print *, serial_scan
  print *, "Time taken in seconds is ", t_end - t_begin

  ! --- SIMD scan ---
  call cpu_time(t_begin)
  do rep = 1, ntimes
     running = 0
     !$omp simd reduction(inscan, +:running)
     do k = 1, n
        running = running + a(k)
        !$omp scan inclusive(running)
        simd_scan(k) = running
     end do
  end do
  call cpu_time(t_end)
  print *, "SIMD Scan Output: "
  print *, simd_scan
  print *, "Time taken in seconds is ", t_end - t_begin

end program inclusive

 

REFERENCES

Explicit Vector Programming in Intel® oneAPI DPC++/C++ Compiler Developer Guide and Reference

SIMD Directive for OpenMP in Intel Fortran Developer Guide and Reference

 

Notices and Disclaimers

Intel technologies may require enabled hardware, software or service activation.

No product or component can be absolutely secure.

Your costs and results may vary.

© Intel Corporation. Intel, the Intel logo, and other Intel marks are trademarks of Intel Corporation or its subsidiaries. Other names and brands may be claimed as the property of others.

No license (express or implied, by estoppel or otherwise) to any intellectual property rights is granted by this document.

The products described may contain design defects or errors known as errata which may cause the product to deviate from published specifications. Current characterized errata are available on request.

Intel disclaims all express and implied warranties, including without limitation, the implied warranties of merchantability, fitness for a particular purpose, and non-infringement, as well as any warranty arising from course of performance, course of dealing, or usage in trade.