Prefetch performance issue in Block-Matching algorithm for 1920x1080 resolution image

Prefetch performance issue in Block-Matching algorithm for 1920x1080 resolution image

Hi,
To improve the real-time performance of a block-matching algorithm with an 8x8 block size,
I used software data prefetching, hoping to reduce data cache misses (specifically, cache read misses).
However, VTune shows that adding the prefetch logic yields worse cache performance than the function
without the software prefetch code.
The block-matching algorithm and the software data-prefetching logic are shown below:
in one loop, we accumulate the sum of absolute differences (SAD) between corresponding pixels of two 8x8 blocks,
one from the previous frame (PF) and one from the current frame (CF), and prefetch the next 8 rows (one cache line ahead) of both PF and CF.

The test PC CPU is Intel Pentium M processor 1500MHz. It has 512 MB of RAM.

The L2 cache read and read-miss event counts obtained from VTune are shown below; the full VTune results
appear at the end.
Process, L2 Cache Reads events, L2 Cache Read Misses (highly correlated)
ZeroMv_Fetch.exe, 20450000, 9105000
ZeroMv_noFetch.exe, 13215000, 6785000

I'm confused about two questions
1. According to the Intel IA-32 Architectures Optimization Reference Manual, automatic hardware prefetching will not be
triggered because the distance between two successive cache-miss addresses is greater than the threshold (512 or 256 bytes for the Pentium M).
The image width is 1920 bytes per row. Is that right?
2. Since hardware prefetching is not triggered, software data prefetching should help decrease the L2 cache read misses.
However, the measured results show the opposite. Why?

//SAD-based block matching (0 MV)
// ARG:nRow, image rows. It is 1080 for test.
// nCol, image column. It is 1920 for test.
// aucImgCF, current frame image, nImgH by nImgW size. Unsigned char.
// aucImgP1, previous frame image, nImgH by nImgW size. Unsigned char.
// pcCFFetchPix, pointed the current frame image for prefetch. Unsigned char *.
// pcP1FetchPix, pointed the previous frame image for prefetch. Unsigned char *.
// pcCFPix, pointed the current frame image block to calculate SAD. Unsigned char *.
// pcP1Pix, pointed the previous frame image block to calculate SAD. Unsigned char *.
// pwZeroSAD, pointed to a nRow/8 by nCol/8 size SAD array. Unsigned short *.

#define CACHE_LINE_SIZE (64) /* L2 cache LINE size in bytes for the Intel Pentium M processor */
/* Prefetch the cache line containing `addr` with the T2 hint (into L2 and higher levels).
   Wrapped in do { } while (0) so the macro behaves as a single statement (safe after a
   braceless `if`); the original multi-line form was missing the `\` line continuation. */
#define DATA_ACCESS_PREFETCH(addr) \
    do { _mm_prefetch((const char *)(addr), _MM_HINT_T2); } while (0)

for (nRow=0; nRow {
for (nCol=0; nCol< (nImgW - CACHE_LINE_SIZE); nCol=nCol+CACHE_LINE_SIZE )
{
//For prefetch
nFetchColIdx = nCol + CACHE_LINE_SIZE;
pcCFFetchPix = &aucImgCF[nRow*nImgW + nFetchColIdx];
pcP1FetchPix = &aucImgP1[nRow*nImgW + nFetchColIdx];
//It needs prefetch 8 lines

for (i=0; i {
//Since L2 data cache line size (CACHE_LINE_SIZE) is 64,
//so there are 8 iterations
DATA_ACCESS_PREFETCH(pcCFFetchPix); pcCFFetchPix += nImgW;
DATA_ACCESS_PREFETCH(pcP1FetchPix); pcP1FetchPix += nImgW;

//Get left-top pixel of P1 block
nP1ImgCol = nCol+i;
pcP1Pix = pcP1Block = &aucImgP1[nRow*nImgW + nP1ImgCol];

//Get left-top pixel of CF block
nCFImgCol = nCol+i;
pcCFPix = pcCFBlock = &aucImgCF[nRow*nImgW + nCFImgCol];

__m128i s0, s1, s2, s3, s4, s5, s6, s7;
s0 = _mm_loadu_si128((__m128i*)pcCFPix); pcCFPix = pcCFPix + nImgW;
s1 = _mm_loadu_si128((__m128i*)pcCFPix); pcCFPix = pcCFPix + nImgW;
s2 = _mm_loadu_si128((__m128i*)pcCFPix); pcCFPix = pcCFPix + nImgW;
s3 = _mm_loadu_si128((__m128i*)pcCFPix); pcCFPix = pcCFPix + nImgW;

s4 = _mm_loadu_si128((__m128i*)pcCFPix); pcCFPix = pcCFPix + nImgW;
s5 = _mm_loadu_si128((__m128i*)pcCFPix); pcCFPix = pcCFPix + nImgW;
s6 = _mm_loadu_si128((__m128i*)pcCFPix); pcCFPix = pcCFPix + nImgW;
s7 = _mm_loadu_si128((__m128i*)pcCFPix); pcCFPix = pcCFPix + nImgW;

s0= _mm_unpacklo_epi64(s0, s1);
s2= _mm_unpacklo_epi64(s2, s3);
s4= _mm_unpacklo_epi64(s4, s5);
s6= _mm_unpacklo_epi64(s6, s7);

__m128i s0_0, s0_1, s0_2, s0_3, s0_4, s0_5, s0_6, s0_7;
s0_0 = _mm_loadu_si128((__m128i*)pcP1Pix); pcP1Pix = pcP1Pix + nImgW;
s0_1 = _mm_loadu_si128((__m128i*)pcP1Pix); pcP1Pix = pcP1Pix + nImgW;
s0_2 = _mm_loadu_si128((__m128i*)pcP1Pix); pcP1Pix = pcP1Pix + nImgW;
s0_3 = _mm_loadu_si128((__m128i*)pcP1Pix); pcP1Pix = pcP1Pix + nImgW;

s0_4 = _mm_loadu_si128((__m128i*)pcP1Pix); pcP1Pix = pcP1Pix + nImgW;
s0_5 = _mm_loadu_si128((__m128i*)pcP1Pix); pcP1Pix = pcP1Pix + nImgW;
s0_6 = _mm_loadu_si128((__m128i*)pcP1Pix); pcP1Pix = pcP1Pix + nImgW;
s0_7 = _mm_loadu_si128((__m128i*)pcP1Pix); pcP1Pix = pcP1Pix + nImgW;

s0_0= _mm_unpacklo_epi64(s0_0, s0_1);
s0_2= _mm_unpacklo_epi64(s0_2, s0_3);
s0_4= _mm_unpacklo_epi64(s0_4, s0_5);
s0_6= _mm_unpacklo_epi64(s0_6, s0_7);

nSAD = _mm_extract_epi16(_mm_adds_epu16(_mm_adds_epu16(_mm_sad_epu
8(s0, s0_0), _mm_sad_epu8(s2, s0_2)),_mm_adds_epu16(_mm_sad_epu8(s4, s0_4), _mm_sad_epu8(s6, s0_6))),0);

*pwZeroSAD = nSAD;
pwZeroSAD++;

}
}

for (; nCol< nImgW; nCol=nCol+CACHE_LINE_SIZE )
{
for (i=0; i {
//Get left-top pixel of P1 block
nP1ImgCol = nCol+i;
pcP1Pix = pcP1Block = &aucImgP1[nRow*nImgW + nP1ImgCol];

//Get left-top pixel of CF block
nCFImgCol = nCol+i;
pcCFPix = pcCFBlock = &aucImgCF[nRow*nImgW + nCFImgCol];

__m128i s0, s1, s2, s3, s4, s5, s6, s7;
s0 = _mm_loadu_si128((__m128i*)pcCFPix); pcCFPix = pcCFPix + nImgW;
s1 = _mm_loadu_si128((__m128i*)pcCFPix); pcCFPix = pcCFPix + nImgW;
s2 = _mm_loadu_si128((__m128i*)pcCFPix); pcCFPix = pcCFPix + nImgW;
s3 = _mm_loadu_si128((__m128i*)pcCFPix); pcCFPix = pcCFPix + nImgW;

s4 = _mm_loadu_si128((__m128i*)pcCFPix); pcCFPix = pcCFPix + nImgW;
s5 = _mm_loadu_si128((__m128i*)pcCFPix); pcCFPix = pcCFPix + nImgW;
s6 = _mm_loadu_si128((__m128i*)pcCFPix); pcCFPix = pcCFPix + nImgW;
s7 = _mm_loadu_si128((__m128i*)pcCFPix); pcCFPix = pcCFPix + nImgW;

s0= _mm_unpacklo_epi64(s0, s1);
s2= _mm_unpacklo_epi64(s2, s3);
s4= _mm_unpacklo_epi64(s4, s5);
s6= _mm_unpacklo_epi64(s6, s7);

__m128i s0_0, s0_1, s0_2, s0_3, s0_4, s0_5, s0_6, s0_7;
s0_0 = _mm_loadu_si128((__m128i*)pcP1Pix); pcP1Pix = pcP1Pix + nImgW;
s0_1 = _mm_loadu_si128((__m128i*)pcP1Pix); pcP1Pix = pcP1Pix + nImgW;
s0_2 = _mm_loadu_si128((__m128i*)pcP1Pix); pcP1Pix = pcP1Pix + nImgW;
s0_3 = _mm_loadu_si128((__m128i*)pcP1Pix); pcP1Pix = pcP1Pix + nImgW;

s0_4 = _mm_loadu_si128((__m128i*)pcP1Pix); pcP1Pix = pcP1Pix + nImgW;
s0_5 = _mm_loadu_si128((__m128i*)pcP1Pix); pcP1Pix = pcP1Pix + nImgW;
s0_6 = _mm_loadu_si128((__m128i*)pcP1Pix); pcP1Pix = pcP1Pix + nImgW;
s0_7 = _mm_loadu_si128((__m128i*)pcP1Pix); pcP1Pix = pcP1Pix + nImgW;

s0_0= _mm_unpacklo_epi64(s0_0, s0_1);
s0_2= _mm_unpacklo_epi64(s0_2, s0_3);
s0_4= _mm_unpacklo_epi64(s0_4, s0_5);
s0_6= _mm_unpacklo_epi64(s0_6, s0_7);

nSAD = _mm_extract_epi16(_mm_adds_epu16(_mm_adds_epu16(_mm_sad_epu8(s0, s0_0), _mm_sad_epu8(s2, s0_2)),_mm_adds_epu16(_mm_sad_epu8(s4, s0_4), _mm_sad_epu8(s6, s0_6))),0);

*pwZeroSAD = nSAD;
pwZeroSAD++;
}
}
}

Process,L2 Cache Reads samples,L2 Cache Read Misses (highly correlated) samples,DTLB misses samples,Instructions Retired samples,Clockticks samples,Cycles per Retired Instruction - CPI,DTLB Miss Rate,L2 Cache Reads %,L2 Cache Read Misses (highly correlated) %,D
TLB misses %,Instructions Retired %,Clockticks %,L2 Cache Reads events,L2 Cache Read Misses (highly correlated) events,DTLB misses events,Instructions Retired events,Clockticks events,Process Path,Process ID
ZeroMv_Fetch.exe,4090,1821,113,639,3418,5.349,0,96.78%,98.54%,68.48%,86.00%,95.05%,20450000,9105000,565000,383400000,2050800000,DeviceHarddiskVolume2IntelCPUeroMvSADRelease,2876

Process,L2 Cache Reads samples,L2 Cache Read Misses (highly correlated) samples,DTLB misses samples,Instructions Retired samples,Clockticks samples,Cycles per Retired Instruction - CPI,DTLB Miss Rate,L2 Cache Reads %,L2 Cache Read Misses (highly correlated) %,DTLB misses %,Instructions Retired %,Clockticks %,L2 Cache Reads events,L2 Cache Read Misses (highly correlated) events,DTLB misses events,Instructions Retired events,Clockticks events,Process Path,Process ID
ZeroMv_noFetch.exe,2643,1357,113,567,3581,6.316,0,95.66%,97.77%,69.75%,85.14%,95.39%,13215000,6785000,565000,340200000,2148600000,DeviceHarddiskVolume2IntelCPUeroMvSADRelease,3352

1 envío / 0 nuevos
Para obtener más información sobre las optimizaciones del compilador, consulte el aviso sobre la optimización.