如何实现最高传输速率

Chapter Title Goes Here, Sub Chapter Title

此处插入章节标题,子章节标题

面向英特尔® MIC 架构的编译器方法

选择编程模式 > 如何实现最高传输速率

概述

下面的简单示例展示了如何测量最优的数据传输速率。该示例介绍了如何使用 _mm_malloc() 和 _mm_free() 来替代 malloc() 和 free(),以便分配和释放在 4K 边界上对齐的数据,这对于向英特尔® 至强融核协处理器的 DMA 传输是最理想的。该示例未提供实际数据速率,只是展示实现高效数据传输所使用的技术。

主题

必须在确保 4K 对齐的情况下分配数据以获得最佳的 DMA 性能。DMA 在通过 PCIe 向英特尔® 至强融核协处理器传输数据时可实现较高的效率。

在计时循环之前,使用 _mm_malloc() 在主机上分配数据,并在 MIC 上分配对应的缓冲区。在循环内部传输数据时,请使用 free_if(0) 和 alloc_if(0)。下面提供了一个简单的代码版本。

如何运行代码:

**************

-bash-4.1$ icc -offload-build bwtest.c

-bash-4.1$ ./a.out -h

Usage:

./a.out -h -a <buffer alignment> -d <device ID> -n <number of iterations>

-bash-4.1$ ./a.out

Bandwidth test. Buffer alignment: 4096. DeviceID: 0. Number of iterations: 20.

       

          Size(Bytes)      Send(Bytes/sec)   Receive(Bytes/sec)

<your results will be shown here>

-bash-4.1$

****************

[p:/] cat bwtest.c

#include <stdio.h>

#include <stdlib.h>

#include <unistd.h>

#include <sys/time.h>

#include <ia32intrin.h>

 

/* Alignment, in bytes, requested for the host buffer (set with -a). */
static int align = 4096;

/* ID of the device named in the offload target clause (set with -d). */
static int device = 0;

/* Number of iterations in the benchmarking loop (set with -n). */
static int niters = 20;

 

/*
 * Host-side buffer used as the source and destination of every transfer.
 * main() allocates it with _mm_malloc() and releases it with _mm_free()
 * once per tested size.
 * NOTE(review): __declspec(target(mic)) is an Intel compiler extension;
 * presumably it makes the pointer usable from offloaded code - confirm
 * against the compiler documentation.
 */

__declspec(target(mic))

static char* buf;

 

/*
 * Buffer sizes to benchmark, in bytes: every power of two from
 * 4096 (1 << 12) up to 536870912 (1 << 29). The trailing 0 marks the
 * end of the table for the loop in main().
 */
static const int bufsizes[] = {
    1 << 12, 1 << 13, 1 << 14, 1 << 15, 1 << 16, 1 << 17,
    1 << 18, 1 << 19, 1 << 20, 1 << 21, 1 << 22, 1 << 23,
    1 << 24, 1 << 25, 1 << 26, 1 << 27, 1 << 28, 1 << 29,
    0
};

 

static void parse_options(int argc, char** argv)

{

    int opt;

 

    while ((opt = getopt(argc, argv, "ha:d:n:")) != -1) {

        switch (opt) {

            case 'a':

                align = atoi(optarg);

                if (align <= 0 || align & (align-1) != 0) {

                    printf("Invalid alignment %d\n", align);

                    exit(1);

                }

                break;

 

            case 'd':

                device = atoi(optarg);

                if (device < 0) {

                    printf("Invalid device ID %d\n", device);

                    exit(1);

                }

                break;

 

            case 'n':

                niters = atoi(optarg);

                if (niters <= 0) {

                    printf("Invalid number of iterations %d\n", niters);

                    exit(1);

                }

                break;

 

            default:

                printf("Usage:\n\t%s -h -a <buffer alignment> -d <device ID> -n

<number of iterations>\n", argv[0]);

                exit(0);

        }

    }

}

 

/*
 * Return the current time of day in seconds, including the microsecond
 * fraction, as reported by gettimeofday(). Despite the name this is
 * wall-clock time, not consumed CPU time.
 *
 * Prints a message and aborts if gettimeofday() fails.
 */
static inline double get_cpu_time()
{
    struct timeval now;
    double seconds;
    double fraction;

    if (gettimeofday(&now, NULL) != 0) {
        printf("gettimeofday returned error\n");
        abort();
    }

    seconds = now.tv_sec;
    fraction = now.tv_usec / 1e6;
    return seconds + fraction;
}

 

/*
 * Measure host <-> coprocessor transfer rates for every size in bufsizes[].
 *
 * For each size the host buffer is allocated with the requested alignment,
 * the coprocessor buffer is set up once, and then niters send and receive
 * transfers are timed separately. The average rate of each direction is
 * printed in bytes per second.
 *
 * Returns 0 on completion; aborts if the host buffer cannot be allocated.
 */
int main(int argc, char **argv)
{
    int     i, j;
    double  send;
    double  receive;

    parse_options(argc, argv);

    printf("Bandwidth test. Buffer alignment: %d. DeviceID: %d. Number of iterations: %d.\n\n",
           align, device, niters);

    printf("%20s %20s %20s\n",
            "Size(Bytes)", "Send(Bytes/sec)", "Receive(Bytes/sec)");

    /* bufsizes[] is terminated by a 0 entry */
    for (i = 0; bufsizes[i] > 0; i++) {
        /* alloc CPU buffer */
        buf = (char*) _mm_malloc(bufsizes[i], align);
        if (buf == 0) {
            printf("Cannot allocate buffer (%d bytes)\n", bufsizes[i]);
            abort();
        }

        /* alloc MIC buffer; free_if(0) keeps it for the timed transfers.
           The clause must directly follow the backslash continuation. */
#pragma offload target(mic: device) \
                in(buf : length(bufsizes[i]) free_if(0))
        {}

        /* The main benchmarking loop */
        send = 0;
        receive = 0;

        for (j = 0; j < niters; j++) {
            double start;

            /* send: alloc_if(0) free_if(0) reuse the existing MIC buffer,
               and the offload body is empty, so no allocation or compute
               is included in the measured interval */
            start = get_cpu_time();
#pragma offload target(mic: device) \
                in(buf : length(bufsizes[i]) alloc_if(0) free_if(0))
            {}
            send += get_cpu_time() - start;

            /* receive: same buffer, opposite direction */
            start = get_cpu_time();
#pragma offload target(mic: device) \
                out(buf : length(bufsizes[i]) alloc_if(0) free_if(0))
            {}
            receive += get_cpu_time() - start;
        }

        /* average seconds per transfer */
        send /= niters;
        receive /= niters;

        printf("%20d %20.2f %20.2f\n",
               bufsizes[i], bufsizes[i]/send, bufsizes[i]/receive);

        /* free MIC buffer: no free_if(0) here, so it is not kept */
#pragma offload target(mic: device) \
                out(buf : length(bufsizes[i]) alloc_if(0))
        {}

        /* free CPU buffer */
        _mm_free(buf);
    }

    return 0;
}

要点

本文介绍了如何使用 _mm_malloc() 和 _mm_free() 替代 malloc() 和 free(),在 4K 边界上实现数据缓冲区对齐。4K 边界是 DMA 传输的最理想选择。此外,本文还提供了代码来针对不同的缓冲区大小测量传输速率。这可以帮助您根据您的数据确定最佳的缓冲区大小。

下一步

要在英特尔® 至强融核协处理器上成功调优您的应用,请务必通读本指南,并点击文中的超链接查看相关内容。本指南提供了实现最佳应用性能所要执行的步骤。

返回到“本机和卸载编程模式”

 

有关编译器优化的更多信息,请参阅优化通知。