在英特尔® 至强融核™ 协处理器上使用面向卸载的英特尔® 语言扩展 (LEO) 在非连续阵列元素之间传输数据

面向 C++ Windows* 和 Linux* 的英特尔® Parallel Studio XE 2015 编译器版本提供了一款增强功能,支持在英特尔® 至强融核™ 协处理器上使用面向卸载的英特尔® 语言扩展 (LEO) 在非连续阵列元素之间传输数据。

该功能在 LEO 卸载数据编组模型下添加了支持,以便使用 #pragma offload/offload_transfer 语句的数据传输子句(如 in、out、inout、nocopy)在阵列变量引用 (variable-ref) 中传输非连续阵列元素。

在卸载数据编组模型下,每个数据传输子句 (in、out、inout、nocopy) 共用一个通用的基本语法,具体见下文。 该增强功能支持在 c-shape 规范中为步长 (stride) 指定一个值,具体如下。

语法:
            #pragma offload clause [ clause …]
            #pragma offload_transfer clause [ clause …]

         其中 clause 可以是以下任一数据传输子句:

                        in (variable-ref [, variable-ref …] [ modifier [ modifier … ] ])
                        out (variable-ref [, variable-ref …] [ modifier [ modifier … ] ])
                        inout (variable-ref [, variable-ref …] [ modifier [ modifier … ] ])
                        nocopy (variable-ref [, variable-ref …] [ modifier [ modifier … ] ])

          variable-ref 是:

                         identifier
                         variable-ref , identifier
                                   为 variable-ref 使用以下语法:
                                          •    variable-name: length ( number-of-elements)
                         variable-ref[c-shape]
                                      为 variable-ref 使用以下语法
                                          •    variable-name [ start :number-of-elements ]   表示连续的
                                               阵列元素集
                                          •    variable-name[start :  number-of-elements:stride]   表示
                                                连续或非连续的阵列元素集
             modifier 是:

                    增强功能未改变的内容。 参阅英特尔® C++ Compiler 15.0 用户和参考指南,了解具体信息。

以下示例介绍了增强功能如何使用包含多种数据移动子句和修饰符 (modifier) 的非单位步长。

--------------------------------------------------------------------------

示例:

#include <stdio.h>#define ALLOC    alloc_if(1)#define No_ALLOC alloc_if(0)#define FREE     free_if(1)#define No_FREE  free_if(0)#define REUSE    alloc_if(0) free_if(0)__declspec( target (mic)) int *a, *b, *c, *d;__declspec( target (mic)) int n=16;__declspec( target (mic))void print_array(char *str,int * array,int start,int count){   printf(str);   printf(" %d",array[start:count]);   printf("\n");   fflush(0);}void print_header(char *str){   int i;   printf(str);   for (i=1;i < strlen(str); i++)       printf("=");   printf("\n");   fflush(0);}void initialize(){   int i;   for (i = 0; i < n; i++)      a[i]=i+2;   b[0:n]=1;   c[0:n]=a[0:n];   d[0:2*n]=0;}void IN_with_stride(){   int i;   char msg[20]="";   print_header("Illustrate IN with non-unit stride\n");   print_array("host : a =",a,0,n);   // Allocate space for a only, allocate/transfer b   #pragma offload_transfer target(mic:0) mandatory \                                 nocopy(a : length(n) ALLOC No_FREE) \                                 in(b : length(n) ALLOC No_FREE)   // Transfer 1/2 of the values of array a with non-unit stride   #pragma offload target(mic:0) mandatory \                                 in(a[0:n/2:2] : REUSE ) \                                 nocopy(b : REUSE)   {      sprintf(msg,"-> mic%d : b (before) =",_Offload_get_device_number());      print_array(msg,b,0,n);      for (i = 0; i < n; i++)          b[i] = a[i];      sprintf(msg,"-> mic%d : a =",_Offload_get_device_number());      print_array(msg,a,0,n);      sprintf(msg,"-> mic%d : b (after) =",_Offload_get_device_number());      print_array(msg,b,0,n);   }   // Free allocations   #pragma offload_transfer target(mic:0) mandatory \                                  nocopy(a,b : No_ALLOC FREE)   printf("\n");}void IN_with_ALLOC_with_stride(){   int i,l,cnt,s;   char msg[20]="";   print_header("Illustrate IN with ALLOC with non-unit stride\n");   print_array("host : c =",c,0,n);   l = 0;   cnt = n/2;   s = 2;   // Allocate partial array 
and transfer non-unit stride elements   // Ensure the number of elements transferred plus the stride   // does not exceed the size of the partial allocation   #pragma offload target(mic:0) mandatory \                   in(c[l:(cnt/s)+(s%2):s] : alloc (c[l:cnt]) ALLOC FREE)   {      sprintf(msg,"-> mic%d : c =",_Offload_get_device_number());      print_array(msg,c,l,cnt);   }   printf("\n");}void INTO_with_stride(){   int i,l,cnt,s;   char msg[20]="";   print_header("Illustrate INTO with non-unit stride\n");   c[0:n]=a[0:n];   print_array("host : a =",a,0,n);   print_array("host : c (before) =",c,0,n);   print_array("host : d[0:n] =",d,0,n);   print_array("host : d[n:n] =",d,n,n);   l = n/2;   cnt = n/2;   s = 2;   // Allocate d only   #pragma offload_transfer target(mic:0) mandatory \                                   nocopy(d : length(2*n) ALLOC No_FREE)   // Transfer a elements with non-unit stride into d on coprocessor only   #pragma offload target(mic:0) mandatory \                                 in(a[0:cnt:s] : into (d[n:cnt:s]) REUSE)   {      sprintf(msg,"-> mic%d : d[0:n] =",_Offload_get_device_number());      print_array(msg,d,0,n);      sprintf(msg,"-> mic%d : d[n:n] =",_Offload_get_device_number());      print_array(msg,d,n,n);   }   // Transfer d elements with non-unit stride into c on host only   // Free the allocation   #pragma offload_transfer target(mic:0) mandatory \                            out(d[n:cnt:s] : into (c[1:cnt:s]) No_ALLOC FREE)   print_array("host : c (after) =",c,0,n);   printf("\n");}int main(int argc, char* argv[]){   a = (int *) _mm_malloc(n*sizeof(int), 64);   b = (int *) _mm_malloc(n*sizeof(int), 64);   c = (int *) _mm_malloc(n*sizeof(int), 64);   d = (int *) _mm_malloc((2*n)*sizeof(int), 64);   initialize();   IN_with_stride();   IN_with_ALLOC_with_stride();   INTO_with_stride();}

 

有关编译器优化的更完整信息,请参阅优化通知