About Cache Blocking

About Cache Blocking

icicle's picture

This also similar with my previous question about bus transaction.
I implement cache blocking. Is it right?

system spec: H/W : IBM Xseries 225
OS : Redhat Linux 9
compiler : icc 8.0
From IA-32 Intel Architecture Optimization Chapter 7"
Cache Blocking Technique
Loop blocking is useful for reducing cache misses and improving
memory access performance. The selection of a suitable block size is
critical when applying the loop blocking technique. Loop blocking is
applicable to single-threaded applications as well as to multithreaded
applications running on processors with or without Hyper-Threading
Technology. The technique transforms the memory access pattern into
blocks that efficiently fit in the target cache size.
When targeting IA-32 processors with Hyper-Threading Technology,
the loop blocking technique should select a block size that is no more
than one half of the target cache size. The upper limit of the block size
for loop blocking should be determined by dividing the target cache size
by the number of logical processors available in a physical processor
package. Typically, some cache lines are needed to access data that are
not part of the source or destination buffers used in cache blocking, so
the block size can be chosen between one quarter to one half of the
target cache (see also, Chapter 3).
User/Source Coding Rule 30. (H impact, H generality) Use cache blocking
to improve locality of data access. Target one quarter to one half of the cache
size when targeting IA-32 processors with Hyper-Threading Technology.

Source for cache blocking
+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
#include
#include
void CacheBlocking(uint* result, uint* array, uint ARRAY_SZ, uint BLOCK_SZ, uint
ITERATIONS);
void NonCacheBlocking(uint* result, uint* array, uint ARRAY_SZ, uint ITERATIONS);
void TimeNonCacheBlocking(uint* array, uint ARRAY_SZ, uint ITERATIONS);
void TimeCacheBlocking(uint* array, uint ARRAY_SZ, uint BLOCK_SZ, uint ITERATIONS);
void timersubb( struct timeval *, struct timeval *, struct timeval *);
int main(int argc, char* argv[])
{
int i, j;
uint ITERATIONS = 1000;
uint ARRAY_SZ = 4096000;
uint* array = (uint*) malloc(sizeof(uint) * ARRAY_SZ);

for(i = 0; i < ITERATIONS; i++)
for(j = 0; j < ARRAY_SZ; j++)
array[j] = 3;

printf("NO Cache Blocking
");
TimeNonCacheBlocking(array, ARRAY_SZ, ITERATIONS);

printf("

Single Threaded Cache Blocking
");
TimeCacheBlocking(array, ARRAY_SZ, 204800, ITERATIONS);
TimeCacheBlocking(array, ARRAY_SZ, 136534, ITERATIONS);
TimeCacheBlocking(array, ARRAY_SZ, 117029, ITERATIONS);
TimeCacheBlocking(array, ARRAY_SZ, 102400, ITERATIONS);
TimeCacheBlocking(array, ARRAY_SZ, 68267, ITERATIONS);
TimeCacheBlocking(array, ARRAY_SZ, 34134, ITERATIONS);
TimeCacheBlocking(array, ARRAY_SZ, 25600, ITERATIONS);
TimeCacheBlocking(array, ARRAY_SZ, 12800, ITERATIONS);
TimeCacheBlocking(array, ARRAY_SZ, 6400, ITERATIONS);
TimeCacheBlocking(array, ARRAY_SZ, 3200, ITERATIONS);
TimeCacheBlocking(array, ARRAY_SZ, 1600, ITERATIONS);
return 0;
}

void
CacheBlocking(uint* result, uint* array, uint ARRAY_SZ, uint BLOCK_SZ, uint
ITERATIONS)
{
uint sum =0;
uint index = 0, i, j;
for(index = 0; index < ARRAY_SZ;) {
uint* data = &array[index];
index += BLOCK_SZ;
if(index > ARRAY_SZ)
BLOCK_SZ = ARRAY_SZ - (index - BLOCK_SZ);
for(i=0; i< ITERATIONS; i++)
for(j=0; j < BLOCK_SZ; j++)
sum += data[j] + data[j] + ITERATIONS;
}
*result = sum;
}

void NonCacheBlocking(uint* result, uint* array, uint ARRAY_SZ, uint ITERATIONS)
{
uint sum =0;
uint i, j;

for(i=0; i< ITERATIONS; i++)
for(j=0; j < ARRAY_SZ; j++)
sum += array[j] + array[j] + ITERATIONS;
*result = sum;
}
void TimeCacheBlocking(uint* array, uint ARRAY_SZ, uint BLOCK_SZ, uint ITERATIONS)
{
uint sum = 0;
struct timeval start, end, result;
gettimeofday(&start, NULL);
CacheBlocking (&sum, array, ARRAY_SZ, BLOCK_SZ, ITERATIONS);
gettimeofday(&end, NULL);
timersubb(&start, &end, &result);
printf("%ld sec, %ld usec
", result.tv_sec, result.tv_usec);
printf("Block Size: %u K ", BLOCK_SZ * sizeof(uint) /1024);
printf("Results: %u
", sum);
}
void TimeNonCacheBlocking(uint* array, uint ARRAY_SZ, uint ITERATIONS)
{
uint sum = 0;
struct timeval start, end, result;
gettimeofday(&start, NULL);
NonCacheBlocking (&sum, array, ARRAY_SZ, ITERATIONS);
gettimeofday(&end, NULL);
timersubb(&start, &end, &result);
printf("%ld sec, %ld usec
", result.tv_sec, result.tv_usec);
printf("Block Size: 0 K ");
printf("Results: %u
", sum);
}

void timersubb( struct timeval *start, struct timeval *end, struct timeval *result)
{
result->tv_sec = end->tv_sec - start->tv_sec;
result->tv_usec = end->tv_usec - start->tv_usec;
if(result->tv_usec < 0) {
--result->tv_sec;
result->tv_usec += 1000000;
}
return;
}

2 posts / 0 new
Last post
For more complete information about compiler optimizations, see our Optimization Notice.
Clay Breshears (Intel)'s picture


Persepone -


I'm not familiar with the text that you are citing from the IA-32 Architecture manual, but your example looks to be correct. Are you not seeing any improvement on your system between the different blocking sizes that you're using?


-- clay

Login to leave a comment.