Data transfer of non-contiguous array elements using the Intel® Language Extensions for Offload (LEO) for the Intel® Xeon Phi™ coprocessor

The Intel® Parallel Studio XE 2015 Composer Editions for C++ Windows* and Linux* have a feature enhancement supporting data transfer for non-contiguous array elements with the Intel® Language Extensions for Offload (LEO) for the Intel® Xeon Phi™ coprocessor.

The feature adds support under the LEO offload data marshalling model for transferring non-contiguous array elements within an array variable reference (variable-ref) in the data transfer clauses (i.e. in, out, inout, nocopy) of the #pragma offload/offload_transfer statement.

Under the offload data marshalling model, each data transfer clause (in, out, inout, nocopy) shares a common basic syntax shown in the details below. The feature enhancement enables specifying a value for stride in the c-shape specification described below.

            #pragma offload clause [ clause …]
            #pragma offload_transfer clause [ clause …]

            Where clause may include the data transfer clauses of:

                        in (variable-ref [, variable-ref …] [ : modifier [ modifier … ] ])
                        out (variable-ref [, variable-ref …] [ : modifier [ modifier … ] ])
                        inout (variable-ref [, variable-ref …] [ : modifier [ modifier … ] ])
                        nocopy (variable-ref [, variable-ref …] [ : modifier [ modifier … ] ])

             And variable-ref is:

                         variable-ref , identifier
                                     Use the following syntax for variable-ref
                                          •    variable-name: length ( number-of-elements)
                                     Use the following syntax for variable-ref
                                          •    variable-name [ start : number-of-elements ]   denotes a contiguous
                                               set of array elements
                                          •    variable-name [ start : number-of-elements : stride ]   denotes either a
                                                contiguous or a non-contiguous set of array elements
             And modifier is:

                          Unchanged by the feature enhancement. Refer to the User and Reference Guide
                          for the Intel® C++ Compiler 15.0 for details.

The following example illustrates the use of the feature enhancement with non-unit stride with various data movement clauses and modifiers.


#include <stdio.h>#define ALLOC    alloc_if(1)#define No_ALLOC alloc_if(0)#define FREE     free_if(1)#define No_FREE  free_if(0)#define REUSE    alloc_if(0) free_if(0)__declspec( target (mic)) int *a, *b, *c, *d;__declspec( target (mic)) int n=16;__declspec( target (mic))void print_array(char *str,int * array,int start,int count){   printf(str);   printf(" %d",array[start:count]);   printf("\n");   fflush(0);}void print_header(char *str){   int i;   printf(str);   for (i=1;i < strlen(str); i++)       printf("=");   printf("\n");   fflush(0);}void initialize(){   int i;   for (i = 0; i < n; i++)      a[i]=i+2;   b[0:n]=1;   c[0:n]=a[0:n];   d[0:2*n]=0;}void IN_with_stride(){   int i;   char msg[20]="";   print_header("Illustrate IN with non-unit stride\n");   print_array("host : a =",a,0,n);   // Allocate space for a only, allocate/transfer b   #pragma offload_transfer target(mic:0) mandatory \                                 nocopy(a : length(n) ALLOC No_FREE) \                                 in(b : length(n) ALLOC No_FREE)   // Transfer 1/2 of the values of array a with non-unit stride   #pragma offload target(mic:0) mandatory \                                 in(a[0:n/2:2] : REUSE ) \                                 nocopy(b : REUSE)   {      sprintf(msg,"-> mic%d : b (before) =",_Offload_get_device_number());      print_array(msg,b,0,n);      for (i = 0; i < n; i++)          b[i] = a[i];      sprintf(msg,"-> mic%d : a =",_Offload_get_device_number());      print_array(msg,a,0,n);      sprintf(msg,"-> mic%d : b (after) =",_Offload_get_device_number());      print_array(msg,b,0,n);   }   // Free allocations   #pragma offload_transfer target(mic:0) mandatory \                                  nocopy(a,b : No_ALLOC FREE)   printf("\n");}void IN_with_ALLOC_with_stride(){   int i,l,cnt,s;   char msg[20]="";   print_header("Illustrate IN with ALLOC with non-unit stride\n");   print_array("host : c =",c,0,n);   l = 0;   cnt = n/2;   s = 2;   // Allocate partial array 
and transfer non-unit stride elements   // Ensure the number of elements transferred plus the stride   // does not exceed the size of the partial allocation   #pragma offload target(mic:0) mandatory \                   in(c[l:(cnt/s)+(s%2):s] : alloc (c[l:cnt]) ALLOC FREE)   {      sprintf(msg,"-> mic%d : c =",_Offload_get_device_number());      print_array(msg,c,l,cnt);   }   printf("\n");}void INTO_with_stride(){   int i,l,cnt,s;   char msg[20]="";   print_header("Illustrate INTO with non-unit stride\n");   c[0:n]=a[0:n];   print_array("host : a =",a,0,n);   print_array("host : c (before) =",c,0,n);   print_array("host : d[0:n] =",d,0,n);   print_array("host : d[n:n] =",d,n,n);   l = n/2;   cnt = n/2;   s = 2;   // Allocate d only   #pragma offload_transfer target(mic:0) mandatory \                                   nocopy(d : length(2*n) ALLOC No_FREE)   // Transfer a elements with non-unit stride into d on coprocessor only   #pragma offload target(mic:0) mandatory \                                 in(a[0:cnt:s] : into (d[n:cnt:s]) REUSE)   {      sprintf(msg,"-> mic%d : d[0:n] =",_Offload_get_device_number());      print_array(msg,d,0,n);      sprintf(msg,"-> mic%d : d[n:n] =",_Offload_get_device_number());      print_array(msg,d,n,n);   }   // Transfer d elements with non-unit stride into c on host only   // Free the allocation   #pragma offload_transfer target(mic:0) mandatory \                            out(d[n:cnt:s] : into (c[1:cnt:s]) No_ALLOC FREE)   print_array("host : c (after) =",c,0,n);   printf("\n");}int main(int argc, char* argv[]){   a = (int *) _mm_malloc(n*sizeof(int), 64);   b = (int *) _mm_malloc(n*sizeof(int), 64);   c = (int *) _mm_malloc(n*sizeof(int), 64);   d = (int *) _mm_malloc((2*n)*sizeof(int), 64);   initialize();   IN_with_stride();   IN_with_ALLOC_with_stride();   INTO_with_stride();}


For more complete information about compiler optimizations, see our Optimization Notice.