Data transfer of non-contiguous array elements using the Intel® Language Extensions for Offload (LEO) for the Intel® Xeon Phi™ coprocessor

Published: 08/22/2014, Last Updated: 08/22/2014

The Intel® Parallel Studio XE 2015 Composer Editions for C++ Windows* and Linux* have a feature enhancement supporting data transfer for non-contiguous array elements with the Intel® Language Extensions for Offload (LEO) for the Intel® Xeon Phi™ coprocessor.

The feature adds support under the LEO offload data marshalling model for transferring non-contiguous array elements within an array variable reference (variable-ref) in the data transfer clauses (i.e. in, out, inout, nocopy) of the #pragma offload/offload_transfer statement.

Under the offload data marshalling model, each data transfer clause (in, out, inout, nocopy) shares a common basic syntax shown in the details below. The feature enhancement enables specifying a value for stride in the c-shape specification described below.

Syntax:
            #pragma offload clause [ clause …]
            #pragma offload_transfer clause [ clause …]

            Where clause may include the data transfer clauses of:

                        in ( variable-ref [, variable-ref …] [ modifier [ modifier … ] ] )
                        out ( variable-ref [, variable-ref …] [ modifier [ modifier … ] ] )
                        inout ( variable-ref [, variable-ref …] [ modifier [ modifier … ] ] )
                        nocopy ( variable-ref [, variable-ref …] [ modifier [ modifier … ] ] )

             And variable-ref is:

                         identifier
                         variable-ref , identifier
                                     Use the following syntax for variable-ref
                                          •    variable-name : length ( number-of-elements )
                         variable-ref [ c-shape ]
                                     Use the following syntax for variable-ref
                                          •    variable-name [ start : number-of-elements ]   denotes contiguous
                                               set of array elements
                                          •    variable-name [ start :  number-of-elements : stride ]   denotes either
                                                contiguous or non-contiguous set of array elements
             And modifier is:

                          Unchanged by the feature enhancement. Refer to the User and Reference Guide
                          for the Intel® C++ Compiler 15.0 for details.

The following example illustrates the use of the feature enhancement with non-unit stride with various data movement clauses and modifiers.

--------------------------------------------------------------------------

Example:
#include <stdio.h>
#include <string.h>

// Shorthand for the alloc_if()/free_if() modifiers used in the offload
// data transfer clauses of the example functions below.
#define ALLOC    alloc_if(1)
#define No_ALLOC alloc_if(0)
#define FREE     free_if(1)
#define No_FREE  free_if(0)
// Both modifiers disabled; used below on data that an earlier pragma already
// allocated (presumably "keep using the existing coprocessor allocation" --
// confirm against the compiler's offload documentation).
#define REUSE    alloc_if(0) free_if(0)

// Work arrays shared by all examples. main() allocates n ints for a, b and c
// and 2*n ints for d. They carry the target(mic) attribute because they are
// referenced in offload clauses and inside offload regions.
__declspec( target (mic)) int *a, *b, *c, *d;
// Base element count used for every array size and transfer length.
__declspec( target (mic)) int n=16;

// Print a label followed by count elements of array, starting at index
// start, all on one line, then flush all output streams.
//
//   str   - label printed verbatim before the values
//   array - array the values are read from
//   start - index of the first element printed
//   count - number of consecutive elements printed
//
// Carries the target(mic) attribute because it is called from inside
// offload regions as well as from host code.
__declspec( target (mic))
void print_array(char *str,int * array,int start,int count)
{
   int i;

   // Print the label as data; it must never be interpreted as a format string
   printf("%s",str);

   // Explicit loop: the values are always printed in index order and the
   // output does not depend on how an array-section function map is ordered
   for (i = start; i < start + count; i++)
      printf(" %d",array[i]);

   printf("\n");
   fflush(0);
}

// Print a section title followed by an underline made of '=' characters,
// then flush all output streams.
//
//   str - title text, printed verbatim. The underline is one character
//         shorter than str, so a title ending in '\n' gets an underline
//         exactly as wide as its visible text.
void print_header(char *str)
{
   size_t len = strlen(str);   // computed once instead of on every iteration
   size_t i;

   // Write the title as data; it must never be interpreted as a format string
   fputs(str,stdout);

   // Starting at 1 skips one position (the trailing newline of the title)
   for (i = 1; i < len; i++)
      putchar('=');

   printf("\n");
   fflush(0);
}


// Fill the host arrays with their starting values:
//   a[k] = k + 2, b[k] = 1 and c[k] = a[k] for k in [0, n);
//   d[k] = 0 for k in [0, 2*n).
void initialize()
{
   int k;

   for (k = 0; k < n; k++) {
      a[k] = k + 2;
      b[k] = 1;
      c[k] = a[k];
   }

   for (k = 0; k < 2*n; k++)
      d[k] = 0;
}

// Demonstrate the in clause with a non-unit stride.
//
// Steps:
//   1. Allocate a on the coprocessor without transferring it; allocate and
//      transfer b.
//   2. Run an offload region that transfers n/2 elements of a with stride 2
//      (a[0:n/2:2]), prints b, copies a into b on the coprocessor and then
//      prints both arrays.
//   3. Free the coprocessor allocations of a and b.
void IN_with_stride()
{
   int i;
   // The longest label below ("-> mic<N> : b (before) =") needs 23 bytes even
   // with a one-digit device number, so the buffer is sized generously and
   // every write is bounded with snprintf
   char msg[64]="";

   print_header("Illustrate IN with non-unit stride\n");
   print_array("host : a =",a,0,n);

   // Allocate space for a only, allocate/transfer b
   #pragma offload_transfer target(mic:0) mandatory \
                                 nocopy(a : length(n) ALLOC No_FREE) \
                                 in(b : length(n) ALLOC No_FREE)

   // Transfer 1/2 of the values of array a with non-unit stride
   #pragma offload target(mic:0) mandatory \
                                 in(a[0:n/2:2] : REUSE ) \
                                 nocopy(b : REUSE)
   {
      snprintf(msg,sizeof(msg),"-> mic%d : b (before) =",_Offload_get_device_number());
      print_array(msg,b,0,n);

      for (i = 0; i < n; i++)
          b[i] = a[i];

      snprintf(msg,sizeof(msg),"-> mic%d : a =",_Offload_get_device_number());
      print_array(msg,a,0,n);

      snprintf(msg,sizeof(msg),"-> mic%d : b (after) =",_Offload_get_device_number());
      print_array(msg,b,0,n);
   }

   // Free allocations
   #pragma offload_transfer target(mic:0) mandatory \
                                  nocopy(a,b : No_ALLOC FREE)

   printf("\n");
}


// Demonstrate the in clause combined with the alloc modifier and a non-unit
// stride: only the section c[l:cnt] is allocated on the coprocessor, and
// every s-th element of that section is transferred into it. The allocation
// is created and freed by the same pragma (ALLOC FREE).
void IN_with_ALLOC_with_stride()
{
   int l,cnt,s;
   char msg[64]="";

   print_header("Illustrate IN with ALLOC with non-unit stride\n");

   print_array("host : c =",c,0,n);

   l = 0;
   cnt = n/2;
   s = 2;

   // Allocate partial array and transfer non-unit stride elements.
   // The element count is ceil(cnt/s), written as (cnt+s-1)/s, so the last
   // transferred index l+(count-1)*s always stays inside the partial
   // allocation c[l:cnt] for any positive stride
   #pragma offload target(mic:0) mandatory \
                   in(c[l:(cnt+s-1)/s:s] : alloc (c[l:cnt]) ALLOC FREE)
   {
      snprintf(msg,sizeof(msg),"-> mic%d : c =",_Offload_get_device_number());
      print_array(msg,c,l,cnt);
   }

   printf("\n");
}


// Demonstrate the into modifier with a non-unit stride in both directions.
//
// Steps:
//   1. Reset c to a copy of a and print the host arrays.
//   2. Allocate d (2*n elements) on the coprocessor without transferring it.
//   3. Transfer cnt elements of a with stride s into d[n:cnt:s] on the
//      coprocessor and print both halves of d there.
//   4. Transfer d[n:cnt:s] back into c[1:cnt:s] on the host, free the
//      coprocessor allocation of d and print c.
void INTO_with_stride()
{
   int cnt,s;
   // Labels are written with snprintf so a write can never run past msg
   char msg[64]="";

   print_header("Illustrate INTO with non-unit stride\n");

   c[0:n]=a[0:n];

   print_array("host : a =",a,0,n);
   print_array("host : c (before) =",c,0,n);
   print_array("host : d[0:n] =",d,0,n);
   print_array("host : d[n:n] =",d,n,n);

   cnt = n/2;
   s = 2;

   // Allocate d only
   #pragma offload_transfer target(mic:0) mandatory \
                                   nocopy(d : length(2*n) ALLOC No_FREE)

   // Transfer a elements with non-unit stride into d on coprocessor only
   #pragma offload target(mic:0) mandatory \
                                 in(a[0:cnt:s] : into (d[n:cnt:s]) REUSE)
   {
      snprintf(msg,sizeof(msg),"-> mic%d : d[0:n] =",_Offload_get_device_number());
      print_array(msg,d,0,n);
      snprintf(msg,sizeof(msg),"-> mic%d : d[n:n] =",_Offload_get_device_number());
      print_array(msg,d,n,n);
   }

   // Transfer d elements with non-unit stride into c on host only
   // Free the allocation
   #pragma offload_transfer target(mic:0) mandatory \
                            out(d[n:cnt:s] : into (c[1:cnt:s]) No_ALLOC FREE)

   print_array("host : c (after) =",c,0,n);
   printf("\n");
}


// Program entry point: allocate the 64-byte-aligned host arrays (n ints for
// a, b and c; 2*n ints for d), initialize them, run the three offload
// examples and release the host memory again.
//
// Returns 0 on success, 1 if any host allocation fails (the examples are
// skipped in that case).
int main(int argc, char* argv[])
{
   int status = 1;

   a = (int *) _mm_malloc(n*sizeof(int), 64);
   b = (int *) _mm_malloc(n*sizeof(int), 64);
   c = (int *) _mm_malloc(n*sizeof(int), 64);
   d = (int *) _mm_malloc((2*n)*sizeof(int), 64);

   if (a != NULL && b != NULL && c != NULL && d != NULL) {
      initialize();
      IN_with_stride();
      IN_with_ALLOC_with_stride();
      INTO_with_stride();
      status = 0;
   } else {
      fprintf(stderr,"Error: host array allocation failed\n");
   }

   // Memory from _mm_malloc must be released with _mm_free; pointers that
   // were never allocated are skipped explicitly
   if (a != NULL) _mm_free(a);
   if (b != NULL) _mm_free(b);
   if (c != NULL) _mm_free(c);
   if (d != NULL) _mm_free(d);
   a = b = c = d = NULL;

   return status;
}

 

Product and Performance Information

1

Intel's compilers may or may not optimize to the same degree for non-Intel microprocessors for optimizations that are not unique to Intel microprocessors. These optimizations include SSE2, SSE3, and SSSE3 instruction sets and other optimizations. Intel does not guarantee the availability, functionality, or effectiveness of any optimization on microprocessors not manufactured by Intel. Microprocessor-dependent optimizations in this product are intended for use with Intel microprocessors. Certain optimizations not specific to Intel microarchitecture are reserved for Intel microprocessors. Please refer to the applicable product User and Reference Guides for more information regarding the specific instruction sets covered by this notice.

Notice revision #20110804