OpenMP* SIMD for Inclusive/Exclusive Scans

Intel® C++ Compiler 19.0 and above and Intel® Fortran Compiler 19.1 and above support SIMD implementations of inclusive and exclusive scans. Starting with OpenMP* Version 5.0, the reduction clause supports scan patterns.

Explicit syntax for inclusive scan using C++

#pragma omp simd reduction(inscan, operator : list)
#pragma omp scan inclusive(item-list)

Explicit syntax for inclusive scan using Fortran

!$omp simd reduction(inscan, operator : list)
!$omp scan inclusive(item-list)

Explicit syntax for exclusive scan using C++

#pragma omp simd reduction(inscan, operator : list)
#pragma omp scan exclusive(item-list)

Explicit syntax for exclusive scan using Fortran

!$omp simd reduction(inscan, operator : list)
!$omp scan exclusive(item-list)

 

Below is a C++ code snippet that computes a prefix sum using the SIMD scan feature. The implementation provides a serial version of the code as well as SIMD versions of both the inclusive and exclusive scans.

Exclusive Scan | Inclusive Scan
#include<iostream>
#include<math.h>
#include<string.h>
#include<chrono>
#define N 16
using namespace std;
int main(){
        int a[N], serial_scan[N], naive_scan[N], simd_scan[N], scan_a;
        for(int i = 0; i < N; i++){
                a[i] = i;
                serial_scan[i] = 0;
                simd_scan[i] = 0;
        }
        //Serial Scan with "+" operator
        auto start = std::chrono::system_clock::now();
        scan_a = 0;
        for(int i = 0; i < N; i++){
                serial_scan[i] = scan_a;
                scan_a += a[i];
        }
        auto stop = std::chrono::system_clock::now();
        std::cout<<"Serial Scan Output:\n";
        for(int i = 0; i < N; i++)
                std::cout<<serial_scan[i]<<"\t";
        std::cout<<"\n";
        std::chrono::duration<double> elapsed_seconds = stop-start;
        std::cout<<"Time taken in seconds is "<<elapsed_seconds.count()<<"\n";
        //SIMD Scan with "+" operator
        start = std::chrono::system_clock::now();
        scan_a = 0;
        #pragma omp simd reduction(inscan, +:scan_a)
        for(int i = 0; i < N; i++){
                simd_scan[i] = scan_a;
                #pragma omp scan exclusive(scan_a)
                scan_a += a[i];
        }
        stop = std::chrono::system_clock::now();
        std::cout<<"SIMD Scan Output:\n";
        for(int i = 0; i < N; i++)
                std::cout<<simd_scan[i]<<"\t";
        std::cout<<"\n";
        elapsed_seconds = stop - start;
        std::cout<<"Time taken in seconds is "<<elapsed_seconds.count()<<"\n";
        return 0;
}

 

#include<iostream>
#include<math.h>
#include<string.h>
#include<chrono>
#define N 16
using namespace std;
int main(){
        int a[N], serial_scan[N], naive_scan[N], simd_scan[N], scan_a;
        for(int i = 0; i < N; i++){
                a[i] = i;
                serial_scan[i] = 0;
                simd_scan[i] = 0;
        }
        //Serial Scan with "+" operator
        auto start = std::chrono::system_clock::now();
        scan_a = 0;
        for(int i = 0; i < N; i++){
                scan_a += a[i];
                serial_scan[i] = scan_a;
        }
        auto stop = std::chrono::system_clock::now();
        std::cout<<"Serial Scan Output:\n";
        for(int i = 0; i < N; i++)
                std::cout<<serial_scan[i]<<"\t";
        std::cout<<"\n";
        std::chrono::duration<double> elapsed_seconds = stop-start;
        std::cout<<"Time taken in seconds is "<<elapsed_seconds.count()<<"\n";
        //SIMD Scan with "+" operator
        start = std::chrono::system_clock::now();
        scan_a = 0;
        #pragma omp simd reduction(inscan, +:scan_a)
        for(int i = 0; i < N; i++){
                scan_a += a[i];
                #pragma omp scan inclusive(scan_a)
                simd_scan[i] = scan_a;
        }
        stop = std::chrono::system_clock::now();
        std::cout<<"SIMD Scan Output:\n";
        for(int i = 0; i < N; i++)
                std::cout<<simd_scan[i]<<"\t";
        std::cout<<"\n";
        elapsed_seconds = stop - start;
        std::cout<<"Time taken in seconds is "<<elapsed_seconds.count()<<"\n";
        return 0;
}

Here is the Fortran version of the same code snippet.

Exclusive Scan | Inclusive Scan
program exclusive
 ! Exclusive prefix sum ("+" scan) of a(1:n), computed with a plain serial
 ! loop and with the OpenMP SIMD scan construct. Each variant is repeated
 ! ntimes, then its result and the CPU time it consumed are printed.
 implicit none
 integer, parameter   :: n = 16
 ! iterate enough times to accumulate some CPU time
 integer, parameter   :: ntimes = 10000000
 ! a: input values; serial_scan / simd_scan: outputs of the two variants;
 ! scn_a: running sum / scan reduction variable
 integer              :: a(n), serial_scan(n), simd_scan(n), scn_a
 integer              :: i, j
 real(8)              :: start, stop

!!! initialize
 do i = 1, n
   a(i) = i-1
 end do
 serial_scan = 0
 simd_scan = 0

!!! Serial Scan with "+" operator
 call cpu_time(start)
 do j = 1, ntimes
   scn_a = 0
   do i = 1, n
     ! Exclusive: element i receives the sum of a(1:i-1), so store the
     ! running sum before a(i) is added to it.
     serial_scan(i) = scn_a
     scn_a = scn_a + a(i)
   end do
 end do
 call cpu_time(stop)
 print *, "Serial Scan Output: "
 print *, serial_scan
 print *, "Time taken in seconds is ",stop-start

!!! SIMD Scan with "+" operator
 call cpu_time(start)
 do j = 1, ntimes
   scn_a = 0
   ! inscan marks scn_a as the scan reduction variable of the inner loop.
!$omp simd reduction(inscan, +:scn_a)
   do i = 1, n
     ! For an exclusive scan the statement that reads scn_a comes before
     ! the scan directive and the update comes after it.
     simd_scan(i) = scn_a
!$omp scan exclusive(scn_a)
     scn_a =  scn_a + a(i)
   end do
 end do
 call cpu_time(stop)
 print *,"SIMD Scan Output: "
 print *, simd_scan
 print *, "Time taken in seconds is ",stop-start

 ! Bare STOP statement (not a reference to the timing variable "stop").
 stop
 end
 
 program inclusive
 implicit none
 integer, parameter   :: n = 16
 integer, parameter   :: ntimes = 10000000  ! iterate enough times to accumulate
 some CPU time
 integer              :: a(N), serial_scan(N), naive_scan(N), simd_scan(N), scn_
a
 integer              :: i, j
 real(8)              :: start, stop

!!! initialize
 do i = 1, n
   a(i) = i-1
 end do
 serial_scan = 0
 simd_scan = 0

!!! Serial Scan with "+" operator
 call cpu_time(start)
 do j = 1, ntimes
 scn_a = 0
   do i = 1, n
     scn_a = scn_a + a(i)
     serial_scan(i) = scn_a
   end do
 end do
 call cpu_time(stop)
 print *, "Serial Scan Output: "
 print *, serial_scan
 print *, "Time taken in seconds is ",stop-start

!!! SIMD Scan with "+" operator
 call cpu_time(start)
 do j = 1, ntimes
 scn_a = 0
!$omp simd reduction(inscan, +:scn_a)
   do i = 1, n
     scn_a =  scn_a + a(i)
!$omp scan inclusive(scn_a)
     simd_scan(i) = scn_a
   end do
 end do
 call cpu_time(stop)
 print *, "SIMD Scan Output: "
 print *, simd_scan
 print *, "Time taken in seconds is ",stop-start
 stop
 end

The single-threaded SIMD implementation runs ~2x faster than the serial, non-vectorized implementation when targeting Intel® Advanced Vector Extensions 2 (Intel® AVX2), for both the C++ and Fortran implementations.

Machine Specification:
Processor: Intel® Xeon® CPU E7-4850 v3 @ 2.20GHz
RAM: 512 GB
Compiler Version: Intel® C++ Compiler 19.0
Compiler Flags: -std=c++11 -xCORE-AVX2
Date of performance run: 7/11/2019

***Performance results are based on testing as of 7/11/2019 and may not reflect all publicly available security updates. See configuration disclosure for details. No product can be absolutely secure.

 

 

 

For more complete information about compiler optimizations, see our Optimization Notice.