Fast implementation of popcount on Xeon Phi

I'm implementing an ultra-fast popcount on the Intel Xeon Phi, as it's a performance hotspot in various bioinformatics software.

I've implemented five versions of the code:

#if defined(__MIC__)
 #include <zmmintrin.h>
 __attribute__((align(64))) static const uint32_t POPCOUNT_4bit[16] = {0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4};
 __attribute__((align(64))) static const uint32_t MASK_4bit[16] = {0xF, 0xF, 0xF, 0xF, 0xF, 0xF, 0xF, 0xF, 0xF, 0xF, 0xF, 0xF, 0xF, 0xF, 0xF, 0xF};
 inline uint64_t vpu_popcount1(uint64_t* buf, size_t n) {
     register size_t result = 0;
     size_t i;
     register const __m512i popcnt = _mm512_load_epi32((void*)POPCOUNT_4bit);
     register const __m512i mask = _mm512_load_epi32((void*)MASK_4bit);
     register __m512i total;
     register __m512i shuf;
     #pragma unroll(8)
     for (i = 0; i < n; i+=8) {
         shuf = _mm512_load_epi32(&buf[i]);
         _mm_prefetch((const char *)&buf[i+256], _MM_HINT_T1); // vprefetch1
         _mm_prefetch((const char *)&buf[i+64], _MM_HINT_T0); // vprefetch0
         total = _mm512_setzero_epi32();
         total = _mm512_add_epi32(_mm512_permutevar_epi32(_mm512_and_epi32(shuf, mask), popcnt), total);
         total = _mm512_add_epi32(_mm512_permutevar_epi32(_mm512_and_epi32(_mm512_srli_epi32(shuf, 4), mask), popcnt), total);
         total = _mm512_add_epi32(_mm512_permutevar_epi32(_mm512_and_epi32(_mm512_srli_epi32(shuf, 8), mask), popcnt), total);
         total = _mm512_add_epi32(_mm512_permutevar_epi32(_mm512_and_epi32(_mm512_srli_epi32(shuf, 12), mask), popcnt), total);
         total = _mm512_add_epi32(_mm512_permutevar_epi32(_mm512_and_epi32(_mm512_srli_epi32(shuf, 16), mask), popcnt), total);
         total = _mm512_add_epi32(_mm512_permutevar_epi32(_mm512_and_epi32(_mm512_srli_epi32(shuf, 20), mask), popcnt), total);
         total = _mm512_add_epi32(_mm512_permutevar_epi32(_mm512_and_epi32(_mm512_srli_epi32(shuf, 24), mask), popcnt), total);
         total = _mm512_add_epi32(_mm512_permutevar_epi32(_mm512_and_epi32(_mm512_srli_epi32(shuf, 28), mask), popcnt), total);
          /* Reduce add, which is analogous to SSSE3's PSADBW instruction,
             is not implemented as a single instruction in VPUv1, so it is
             emulated by multiple instructions. */
         result += _mm512_reduce_add_epi32(total);
     }
     return result;
 }
 __attribute__((align(64))) static const unsigned magic[] = {
 0x55555555, 0x55555555, 0x55555555, 0x55555555,
 0x55555555, 0x55555555, 0x55555555, 0x55555555,
 0x55555555, 0x55555555, 0x55555555, 0x55555555,
 0x55555555, 0x55555555, 0x55555555, 0x55555555,
 0x33333333, 0x33333333, 0x33333333, 0x33333333,
 0x33333333, 0x33333333, 0x33333333, 0x33333333,
 0x33333333, 0x33333333, 0x33333333, 0x33333333,
 0x33333333, 0x33333333, 0x33333333, 0x33333333,
 0x0F0F0F0F, 0x0F0F0F0F, 0x0F0F0F0F, 0x0F0F0F0F,
 0x0F0F0F0F, 0x0F0F0F0F, 0x0F0F0F0F, 0x0F0F0F0F,
 0x0F0F0F0F, 0x0F0F0F0F, 0x0F0F0F0F, 0x0F0F0F0F,
 0x0F0F0F0F, 0x0F0F0F0F, 0x0F0F0F0F, 0x0F0F0F0F,
 0x00FF00FF, 0x00FF00FF, 0x00FF00FF, 0x00FF00FF,
 0x00FF00FF, 0x00FF00FF, 0x00FF00FF, 0x00FF00FF,
 0x00FF00FF, 0x00FF00FF, 0x00FF00FF, 0x00FF00FF,
 0x00FF00FF, 0x00FF00FF, 0x00FF00FF, 0x00FF00FF,
 0x0000FFFF, 0x0000FFFF, 0x0000FFFF, 0x0000FFFF,
 0x0000FFFF, 0x0000FFFF, 0x0000FFFF, 0x0000FFFF,
 0x0000FFFF, 0x0000FFFF, 0x0000FFFF, 0x0000FFFF,
 0x0000FFFF, 0x0000FFFF, 0x0000FFFF, 0x0000FFFF
 };
 inline uint64_t vpu_popcount2(uint64_t* buf, size_t n) {
     register size_t result = 0;
     size_t i;
     register const __m512i B0 = _mm512_load_epi32((void*)(magic+0));
     register const __m512i B1 = _mm512_load_epi32((void*)(magic+16));
     register const __m512i B2 = _mm512_load_epi32((void*)(magic+32));
     register const __m512i B3 = _mm512_load_epi32((void*)(magic+48));
     register const __m512i B4 = _mm512_load_epi32((void*)(magic+64));
     register __m512i total;
     register __m512i shuf;
     #pragma unroll(8)
     for (i = 0; i < n; i+=8) {
         shuf = _mm512_load_epi32(&buf[i]);
         _mm_prefetch((const char *)&buf[i+512], _MM_HINT_T1); // vprefetch1
         _mm_prefetch((const char *)&buf[i+64], _MM_HINT_T0); // vprefetch0
         total = _mm512_sub_epi32(shuf, _mm512_and_epi32(B0, _mm512_srli_epi32(shuf,1)));
         total = _mm512_add_epi32(_mm512_and_epi32(B1, total), _mm512_and_epi32(B1,_mm512_srli_epi32(total,2)));
         total = _mm512_and_epi32(B2, _mm512_add_epi32(total, _mm512_srli_epi32(total,4)));
         total = _mm512_and_epi32(B3, _mm512_add_epi32(total, _mm512_srli_epi32(total,8)));
         total = _mm512_and_epi32(B4, _mm512_add_epi32(total, _mm512_srli_epi32(total,16)));
          /* Reduce add, which is analogous to SSSE3's PSADBW instruction,
             is not implemented as a single instruction in VPUv1, so it is
             emulated by multiple instructions. */
         result += _mm512_reduce_add_epi32(total);
     }
     return result;
 }
 inline uint64_t vpu_popcount3(uint64_t* buf, size_t n) {
     register size_t result = 0;
     size_t i;
     register const __m512i B0 = _mm512_load_epi32((void*)(magic+0));
     register const __m512i B1 = _mm512_load_epi32((void*)(magic+16));
     register const __m512i B2 = _mm512_load_epi32((void*)(magic+32));
     register const __m512i B3 = _mm512_load_epi32((void*)(magic+48));
     register const __m512i B4 = _mm512_load_epi32((void*)(magic+64));
     register __m512i total;
     register __m512i shuf;
     #pragma unroll(4)
     for (i = 0; i < n; i+=16) {
         shuf = _mm512_load_epi32(&buf[i]);
         result += _mm_countbits_64(buf[i+8]);
         _mm_prefetch((const char *)&buf[i+512], _MM_HINT_T1); // vprefetch1
         _mm_prefetch((const char *)&buf[i+576], _MM_HINT_T1); // vprefetch1
         result += _mm_countbits_64(buf[i+9]);
         _mm_prefetch((const char *)&buf[i+64], _MM_HINT_T0); // vprefetch0
         _mm_prefetch((const char *)&buf[i+128], _MM_HINT_T0); // vprefetch0
         total = _mm512_sub_epi32(shuf, _mm512_and_epi32(B0, _mm512_srli_epi32(shuf,1)));
         result += _mm_countbits_64(buf[i+10]);
         total = _mm512_add_epi32(_mm512_and_epi32(B1, total), _mm512_and_epi32(B1,_mm512_srli_epi32(total,2)));
         result += _mm_countbits_64(buf[i+11]);
         total = _mm512_and_epi32(B2, _mm512_add_epi32(total, _mm512_srli_epi32(total,4)));
         result += _mm_countbits_64(buf[i+12]);
         total = _mm512_and_epi32(B3, _mm512_add_epi32(total, _mm512_srli_epi32(total,8)));
         result += _mm_countbits_64(buf[i+13]);
         total = _mm512_and_epi32(B4, _mm512_add_epi32(total, _mm512_srli_epi32(total,16)));
         result += _mm_countbits_64(buf[i+14]);
          /* Reduce add, which is analogous to SSSE3's PSADBW instruction,
             is not implemented as a single instruction in VPUv1, so it is
             emulated by multiple instructions. */
         result += _mm512_reduce_add_epi32(total);
         result += _mm_countbits_64(buf[i+15]);
     }
     
     return result;
 }
 /* Scalar versions using the VPU's or SSE's popcount machine intrinsic;
  * CPUs without such support fall back to the compiler's own implementation,
  * whose speed varies. */
 static inline size_t scalar_popcountu(unsigned *buf, size_t n) {
     register size_t cnt = 0;
     size_t i;
     #pragma vector always
     #pragma unroll(8)
     for (i = 0; i < n; i++) {
         cnt += _mm_countbits_32(buf[i]);
         _mm_prefetch((const char *)&buf[i+512], _MM_HINT_T1); // vprefetch1
         _mm_prefetch((const char *)&buf[i+64], _MM_HINT_T0); // vprefetch0
     }
     return cnt;
 }
 static inline size_t scalar_popcountlu(uint64_t *buf, size_t n) {
     register size_t cnt = 0;
     size_t i;
     #pragma vector always
     #pragma unroll(8)
     for (i = 0; i < n; i++) {
         cnt += _mm_countbits_64(buf[i]);
         _mm_prefetch((const char *)&buf[i+512], _MM_HINT_T1); // vprefetch1
         _mm_prefetch((const char *)&buf[i+64], _MM_HINT_T0); // vprefetch0
     }
     return cnt;
 }
 #endif

A wrap-up of the code with OpenMP support can be downloaded from <https://www.dropbox.com/sh/b3sfqps19wa2oi4/iFQ9wQ1NTg>.

The code was compiled with Intel C/C++ Compiler XE 13 using the command:

icc -debug inline-debug-info -O3 -mmic -fno-alias -ansi-alias -opt-streaming-stores always -ipo popcnt-mmic.cpp -o popcnt-mmic -vec-report=2 -openmp

The code runs natively on the co-processor (61 cores) with 122 threads and "balanced" thread affinity, set via:

export OMP_NUM_THREADS=122;export KMP_AFFINITY=balanced
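
For reference, here is a minimal sketch of how the OpenMP harness splits the work. The actual driver is in the attached popcnt-mmic.cpp; the function name omp_popcount and the chunking below are illustrative only, assuming the total word count n is a multiple of 8:

#include <stdint.h>
#include <stddef.h>
#include <omp.h>

uint64_t omp_popcount(uint64_t* buf, size_t n) { /* n = number of 64-bit words, assumed multiple of 8 */
    uint64_t total = 0;
    #pragma omp parallel reduction(+:total)
    {
        int tid  = omp_get_thread_num();
        int nthr = omp_get_num_threads();
        /* give every thread a chunk that is a multiple of 8 words, so each
           chunk starts on a 64-byte boundary and feeds whole cache lines */
        size_t chunk = ((n / nthr) / 8) * 8;
        size_t begin = (size_t)tid * chunk;
        size_t len   = (tid == nthr - 1) ? n - begin : chunk;
        total += vpu_popcount2(buf + begin, len); /* or any of the kernels above */
    }
    return total;
}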

Testing on 28 megabytes of junk data (filled by rand()) and iterating 10000 times, the performance is as follows:

Buffer allocated at: 0x7f456b000000
OpenMP scalar_popcountu 4310169 us; cnt = 28439328
OpenMP scalar_popcountlu 1421139 us; cnt = 28439328
OpenMP vpu_popcount 1489992 us; cnt = 28439328
OpenMP vpu_popcount2 1109530 us; cnt = 28439328
OpenMP vpu_popcount3 951122 us; cnt = 28439328

The "scalar_popcountu" and "scalar_popcountlu" use "_mm_countbits_32" and "_mm_countbits_64" intrinsics respectively, which utilize the scalar "popcnt" instruction. Setting "#pragma vector always" asks to compiler to vectorize the load and sum as 16 unsigned ints or 8 unsigned longs a time, although the popcount itself is still a scalar instruction.

The implementation of vpu_popcount1 is analogous to the SSSE3 popcount implementation <http://wm.ite.pl/articles/sse-popcount.html>. However, 1) Xeon Phi does not support packed byte operations on integers (the smallest granularity is double words, i.e. 32 bits), and 2) it does not implement a "packed sum of absolute differences" instruction (like _mm_sad_epu8 in SSSE3), so the reduce add has to be performed by a combination of four groups of "vpermf32x4", "vpaddd" and "movslq". As a result, the implementation generates many more instructions than the original SSSE3 version.
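
For clarity, this is the plain scalar form of the 4-bit table lookup that vpu_popcount1 vectorizes (a reference sketch only, not one of the tuned kernels; the vector version performs the lookup on 16 dwords at once with _mm512_permutevar_epi32):

#include <stdint.h>

static inline unsigned popcount32_nibble_lut(uint32_t x) {
    /* popcount of every possible 4-bit nibble */
    static const unsigned char lut[16] =
        {0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4};
    unsigned cnt = 0;
    for (int s = 0; s < 32; s += 4)   /* 8 nibbles per 32-bit word */
        cnt += lut[(x >> s) & 0xF];
    return cnt;
}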

The implementation of vpu_popcount2 is analogous to the SSE2 popcount implementation (see "Hacker's Delight"). It generates fewer instructions than vpu_popcount1 and is about 30% faster. However, the tedious reduce add still cannot be avoided.
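
Again for reference, the scalar 32-bit routine from "Hacker's Delight" that vpu_popcount2 applies lane-wise to 16 dwords at a time (the five steps correspond to the B0..B4 constants above):

#include <stdint.h>

static inline unsigned popcount32_hd(uint32_t x) {
    x = x - ((x >> 1) & 0x55555555);                /* 2-bit field sums  */
    x = (x & 0x33333333) + ((x >> 2) & 0x33333333); /* 4-bit field sums  */
    x = (x + (x >> 4)) & 0x0F0F0F0F;                /* 8-bit field sums  */
    x = (x + (x >> 8)) & 0x00FF00FF;                /* 16-bit field sums */
    x = (x + (x >> 16)) & 0x0000FFFF;               /* final 32-bit sum  */
    return x;
}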

The implementation of vpu_popcount3 is very Xeon Phi specific. With its mixture of vector and scalar operations, it's about 15% faster than vpu_popcount2 (the interleaving of scalar operations among the vector operations is fairly casual in my implementation; one could rearrange the scalar operations according to the assembly code generated by the compiler, but the expected improvement is limited as far as I'm concerned). The improvement is based on the observations that 1) Xeon Phi schedules instructions in order, and 2) two scalar instructions, or one vector plus one scalar instruction, can be issued per clock cycle. I've decreased the unroll factor from 8 to 4 to avoid register file saturation.

The explicit prefetching in each function, from memory to L2 eight loop iterations ahead and from L2 to L1 one iteration ahead, has increased the L1 hit ratio from 0.38 to 0.994.
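
To put those distances in concrete terms for the kernels that prefetch &buf[i+512] and &buf[i+64]: with 8-byte elements and 64-byte cache lines, &buf[i+512] is 4096 bytes (64 cache lines) ahead and &buf[i+64] is 512 bytes (8 cache lines) ahead; since each unrolled trip of an i+=8 loop with unroll(8) consumes 64 elements (8 cache lines), that works out to 8 trips ahead for the L2 prefetch and 1 trip ahead for the L1 prefetch.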

The unrolling does increase the performance by about 15%. This is counter-intuitive since Xeon Phi uses in-order scheduling, but unrolling enables the icc compiler to do as much compile-time scheduling as possible.

Are there any more techniques to boost the performance?

Attachment: popcnt-mmic.cpp (14.44 KB)