Data transfer of non-contiguous array elements using the Intel® Language Extensions for Offload (LEO) for the Intel® Xeon Phi™ coprocessor

The Intel® Parallel Studio XE 2015 Composer Editions for C++ Windows* and Linux* have a feature enhancement supporting data transfer for non-contiguous array elements with the Intel® Language Extensions for Offload (LEO) for the Intel® Xeon Phi™ coprocessor.

The feature adds support under the LEO offload data marshalling model for transferring non-contiguous array elements within an array variable reference (variable-ref) in the data transfer clauses (i.e. in, out, inout, nocopy) of the #pragma offload/offload_transfer statement.

Under the offload data marshalling model, each data transfer clause (in, out, inout, nocopy) shares a common basic syntax shown in the details below. The feature enhancement enables specifying a value for stride in the c-shape specification described below.

Syntax:
            #pragma offload clause [ clause …]
            #pragma offload_transfer clause [ clause …]

            Where clause may include the data transfer clauses of:

                        in ( variable-ref [, variable-ref …] [ modifier [ modifier … ] ] )
                        out ( variable-ref [, variable-ref …] [ modifier [ modifier … ] ] )
                        inout ( variable-ref [, variable-ref …] [ modifier [ modifier … ] ] )
                        nocopy ( variable-ref [, variable-ref …] [ modifier [ modifier … ] ] )

             And variable-ref is:

                         identifier
                         variable-ref , identifier
                                     Use the following syntax for variable-ref
                                          •    variable-name : length ( number-of-elements )
                         variable-ref [ c-shape ]
                                     Use the following syntax for variable-ref
                                          •    variable-name [ start : number-of-elements ]   denotes a contiguous
                                               set of array elements
                                          •    variable-name [ start : number-of-elements : stride ]   denotes either
                                                a contiguous or a non-contiguous set of array elements
             And modifier is:

                          Unchanged by the feature enhancement. Refer to the User and Reference Guide
                          for the Intel® C++ Compiler 15.0 for details.

The following example illustrates the use of the feature enhancement with non-unit stride with various data movement clauses and modifiers.

--------------------------------------------------------------------------

Example:
#include <stdio.h>
#include <string.h>

// Shorthand for the alloc_if/free_if modifiers used in the data transfer
// clauses of the offload pragmas below.
#define ALLOC    alloc_if(1)
#define No_ALLOC alloc_if(0)
#define FREE     free_if(1)
#define No_FREE  free_if(0)
// REUSE combines alloc_if(0) and free_if(0): neither allocate nor free.
#define REUSE    alloc_if(0) free_if(0)

// Work arrays and their base element count, declared with target(mic).
// main() allocates n elements for a, b and c, and 2*n elements for d.
__declspec( target (mic)) int *a, *b, *c, *d;
__declspec( target (mic)) int n=16;

// Print a label followed by count elements of array, beginning at index
// start, then a newline, and flush all open output streams.
//
//   str   - label text, written verbatim
//   array - base pointer of the array to print
//   start - index of the first element printed
//   count - number of elements printed
__declspec( target (mic))
void print_array(char *str,int * array,int start,int count)
{
   int i;

   // Write the label as data, never as a format string, so a '%' in str
   // is not interpreted as a conversion specification.
   fputs(str,stdout);

   // Explicit loop so the elements are always printed in index order.
   for (i = start; i < start + count; i++)
      printf(" %d",array[i]);

   printf("\n");
   fflush(0);
}

// Print a section title followed by an underline of '=' characters, then
// flush all open output streams.
//
// The underline is one character shorter than str; the callers in this file
// pass titles ending in '\n', so the underline matches the visible text.
//
//   str - title text, written verbatim
void print_header(char *str)
{
   size_t i;
   size_t len = strlen(str);   // measured once instead of on every iteration

   // Write the title as data, never as a format string, so a '%' in str
   // is not interpreted as a conversion specification.
   fputs(str,stdout);

   for (i = 1; i < len; i++)
       printf("=");

   printf("\n");
   fflush(0);
}


// Give the host arrays their starting values:
//   a[k] = k + 2, b[k] = 1 and c[k] = a[k] for k in [0, n),
//   d[k] = 0 for k in [0, 2*n).
void initialize()
{
   int k;

   for (k = 0; k < n; ++k) {
      a[k] = k + 2;
      b[k] = 1;
      c[k] = a[k];
   }

   for (k = 0; k < 2*n; ++k)
      d[k] = 0;
}

// Illustrates the in clause with a non-unit stride.
//
// Space for a and b is set up on the coprocessor first (b is also
// transferred), then every second element of a is transferred and a is
// copied into b on the coprocessor, printing the arrays along the way.
// The allocations are freed before returning.
void IN_with_stride()
{
   int i;
   // The longest label below ("-> mic<N> : b (before) =") needs 23 bytes
   // even with a one-digit device number, so the buffer is sized with room
   // to spare and every write is bounded with snprintf.
   char msg[64]="";

   print_header("Illustrate IN with non-unit stride\n");
   print_array("host : a =",a,0,n);

   // Allocate space for a only, allocate/transfer b
   #pragma offload_transfer target(mic:0) mandatory \
                                 nocopy(a : length(n) ALLOC No_FREE) \
                                 in(b : length(n) ALLOC No_FREE)

   // Transfer 1/2 of the values of array a with non-unit stride
   #pragma offload target(mic:0) mandatory \
                                 in(a[0:n/2:2] : REUSE ) \
                                 nocopy(b : REUSE)
   {
      snprintf(msg,sizeof msg,"-> mic%d : b (before) =",_Offload_get_device_number());
      print_array(msg,b,0,n);

      // Copies all n elements of a, although only indices 0,2,4,... were
      // transferred by the in clause above.
      for (i = 0; i < n; i++)
          b[i] = a[i];

      snprintf(msg,sizeof msg,"-> mic%d : a =",_Offload_get_device_number());
      print_array(msg,a,0,n);

      snprintf(msg,sizeof msg,"-> mic%d : b (after) =",_Offload_get_device_number());
      print_array(msg,b,0,n);
   }

   // Free allocations
   #pragma offload_transfer target(mic:0) mandatory \
                                  nocopy(a,b : No_ALLOC FREE)

   printf("\n");
}


// Illustrates the in clause combined with the alloc modifier and a
// non-unit stride: a partial allocation c[l:cnt] is requested for the
// offload and only every s-th element inside it is transferred.
void IN_with_ALLOC_with_stride()
{
   int l,cnt,s,xfer;
   char msg[64]="";

   print_header("Illustrate IN with ALLOC with non-unit stride\n");

   print_array("host : c =",c,0,n);

   l = 0;
   cnt = n/2;
   s = 2;

   // Number of stride-s elements that fit inside c[l:cnt], i.e. ceil(cnt/s).
   // The last element transferred is at index l + (xfer-1)*s, which is
   // always below l + cnt, so the transfer stays inside the allocation.
   xfer = (cnt + s - 1) / s;

   // Allocate partial array and transfer non-unit stride elements
   #pragma offload target(mic:0) mandatory \
                   in(c[l:xfer:s] : alloc (c[l:cnt]) ALLOC FREE)
   {
      snprintf(msg,sizeof msg,"-> mic%d : c =",_Offload_get_device_number());
      print_array(msg,c,l,cnt);
   }

   printf("\n");
}


// Illustrates the into modifier with a non-unit stride in both directions:
// strided elements of a are transferred into d on the coprocessor, then
// strided elements of d are transferred back into c on the host.
void INTO_with_stride()
{
   int cnt,s;
   // Large enough for every label below; writes are bounded with snprintf.
   char msg[64]="";

   print_header("Illustrate INTO with non-unit stride\n");

   c[0:n]=a[0:n];

   print_array("host : a =",a,0,n);
   print_array("host : c (before) =",c,0,n);
   print_array("host : d[0:n] =",d,0,n);
   print_array("host : d[n:n] =",d,n,n);

   cnt = n/2;
   s = 2;

   // Allocate d only
   #pragma offload_transfer target(mic:0) mandatory \
                                   nocopy(d : length(2*n) ALLOC No_FREE)

   // Transfer a elements with non-unit stride into d on coprocessor only
   #pragma offload target(mic:0) mandatory \
                                 in(a[0:cnt:s] : into (d[n:cnt:s]) REUSE)
   {
      snprintf(msg,sizeof msg,"-> mic%d : d[0:n] =",_Offload_get_device_number());
      print_array(msg,d,0,n);
      snprintf(msg,sizeof msg,"-> mic%d : d[n:n] =",_Offload_get_device_number());
      print_array(msg,d,n,n);
   }

   // Transfer d elements with non-unit stride into c on host only
   // Free the allocation
   #pragma offload_transfer target(mic:0) mandatory \
                            out(d[n:cnt:s] : into (c[1:cnt:s]) No_ALLOC FREE)

   print_array("host : c (after) =",c,0,n);
   printf("\n");
}


// Program entry point: allocate the host arrays, initialize them and run
// each illustration in turn.
//
// Returns 0 on success, or 1 if any host array could not be allocated.
int main(int argc, char* argv[])
{
   // Host buffers requested with 64-byte alignment; d holds 2*n elements,
   // the others n elements each.
   a = (int *) _mm_malloc(n*sizeof(int), 64);
   b = (int *) _mm_malloc(n*sizeof(int), 64);
   c = (int *) _mm_malloc(n*sizeof(int), 64);
   d = (int *) _mm_malloc((2*n)*sizeof(int), 64);

   // initialize() and the routines after it use these pointers without
   // checking them, so stop here if any allocation failed.
   if (a == NULL || b == NULL || c == NULL || d == NULL) {
      fprintf(stderr,"Failed to allocate host arrays\n");
      return 1;
   }

   initialize();
   IN_with_stride();
   IN_with_ALLOC_with_stride();
   INTO_with_stride();

   // The buffers are not released explicitly; they live until process exit.
   return 0;
}

 

For more complete information about compiler optimizations, see our Optimization Notice.