Performance Tools for Software Developers - Loop blocking

Loop blocking is a combination of strip mining and loop interchange to enhance reuse of local data. It helps the nested loops that manipulate arrays and are too large to fit into the cache. The loop blocking allows reuse of the arrays by transforming the loops such that the transformed loops manipulate array strips that fit into the cache. In effect, a blocked loop uses array elements in sections that are optimally sized to fit in the cache.

 

Use cache blocking to arrange a loop so it will perform as many computations as possible on data already residing in cache. (The next block of data is not read into cache until computations using the first block are finished.)

The loop blocking optimization is part of HLO phase in Intel compiler and is available when using compiler option -O3. The compiler uses default heuristics for loop blocking. But you may also use /Qopt-block-factor:n in Windows or -opt-block-factor:n in Linux to specify loop blocking factor.

Data reuse:

Data reuse is important to understand blocking. There are two types of data reuse associated with loop blocking:

  • Spatial reuse
  • Temporal reuse

 

Spatial reuse

Spatial reuse uses data that was encached as a result of fetching another piece of data from memory. The data is fetched one cache lines at a time. This is 64 bytes for Intel® Core2 processors. If the requested data is located at the beginning of the cache line (aligned data), and the rest of the cache line contains subsequent array elements then for float array, this means the requested element and the seven following elements are cached on each fetch after the first. If any of these seven elements could then be used on any subsequent iterations of the loop, the loop would be exploiting spatial reuse. For loops with strides greater than one, spatial reuse can still occur. However, the cache lines contain fewer usable elements.

Temporal reuse

Temporal reuse uses the same data item in more than one iteration of the loop. If the loop uses the same element in subsequent loop iterations then loop exhibits temporal reuse in the context of the loop. The blocking exploits spatial reuse by ensuring that once fetched, cache lines are not overwritten until their spatial reuse is exhausted.

Example 1: Simple Loop Blocking

The following example demonstrates the simple loop blocking. The loop blocking allows arrays A and B to be blocked into smaller rectangular chunks so that the total combined size of two blocked (A and B) chunks is smaller than cache size, which can improve data reuse.

// before_loopblocking.cpp

/*

* icl /Qoption,link,"/STACK:1000000000" before_loopblocking.cpp

*/

#include <time.h>

#include <stdio.h>

#define MAX 8000

void add(int a[][MAX], int b[][MAX]);

int main()

{

int i, j;

int A[MAX][MAX];

int B[MAX][MAX];

clock_t before, after;

//Initialize array

for(i=0;i<MAX;i++)

{

for(j=0;j<MAX; j++)

{

A[i][j]=j;

B[i][j]=j;

}

}

before = clock();

add(A, B);

add(A, B);

add(A, B);

add(A, B);

after = clock();

printf("\nTime taken to complete : %7.2lf secs\n", (float)(after - before)/ CLOCKS_PER_SEC); //List time taken to complete add function

}

void add(int a[][MAX], int b[][MAX])

{

int i, j;

for(i=0;i<MAX;i++)

{

for(j=0; j<MAX;j++)

{

a[i][j] = a[i][j] + b[j][i]; //Adds two matrices

}

}

}

The above code is modified below to enhance reuse of the cached data:

// after_loopblocking.cpp

/*

* icl /Qoption,link,"/STACK:1000000000" after_loopblocking.cpp

*/

#include <stdio.h>

#include <time.h>

#define MAX 8000

#define BS 16 //Block size is selected as the loop-blocking factor.

void add(int a[][MAX], int b[][MAX]);

int main()

{

int i, j;

int A[MAX][MAX];

int B[MAX][MAX];

clock_t before, after;

//Initialize array

for(i=0;i<MAX;i++)

{

for(j=0;j<MAX; j++)

{

A[i][j]=j;

B[i][j]=j;

}

}

before = clock();

add(A, B);

add(A, B);

add(A, B);

add(A, B);

after = clock();

printf("\nTime taken to complete : %7.2lf secs\n", (float)(after - before)/ CLOCKS_PER_SEC); //List time taken to complete add function

}

void add(int a[][MAX], int b[][MAX])

{

int i, j, ii, jj;

for(i=0;i<MAX;i+=BS)

{

for(j=0; j<MAX;j+=BS)

{

for(ii=i; ii<i+BS; ii++)//outer loop

{

for(jj=j; jj<j+BS; jj++)

{

//Array B experiences one cache miss

//for every iteration of outer loop

a[ii][jj] = a[ii][jj] + b[jj][ii]; }

}

}

}

}

Example 2: Complex Blocking

// matrixMul.cpp

/*

* icl /Qoption,link,"/STACK:1000000000" matrixMul.cpp

*/

#include <stdio.h>

#include <time.h>

#define MAX 800

void matmul(int c[][MAX], int a[][MAX], int b[][MAX]);

int main()

{

int i, j;

int A[MAX][MAX];

int B[MAX][MAX];

int C[MAX][MAX];

clock_t before, after;

//Initialize array

for(i=0;i<MAX;i++)

{

for(j=0;j<MAX; j++)

{

A[i][j]=j;

B[i][j]=j;

}

}

before = clock();

matmul(C, A, B);

after = clock();

printf("\nTime taken to complete : %7.2lf secs\n", (float)(after - before)/ CLOCKS_PER_SEC); //List time taken to complete add function

}

void matmul(int c[][MAX], int a[][MAX], int b[][MAX])

{

int i, j, k;

for(i=0;i<MAX;i++)

{

for(j=0; j<MAX;j++)

{

for(k=0; k < MAX; k++)

{

c[i][j] = c[i][j] + a[i][k] * b[k][j];

}

}

}

}

The above code is modified below to enhance spatial and temporal reuse of the cached data for array a, b and c:

// matrixMulBlk.cpp

/*

* icl /Qoption,link,"/STACK:1000000000" matrixMulBlk.cpp

*/

#include <stdio.h>

#include <time.h>

#define MAX 800

#define BS 16 //Block size is selected as the loop-blocking factor.

void matmul(int c[][MAX], int a[][MAX], int b[][MAX]);

int main()

{

int i, j;

int A[MAX][MAX];

int B[MAX][MAX];

int C[MAX][MAX];

clock_t before, after;

//Initialize array

for(i=0;i<MAX;i++)

{

for(j=0;j<MAX; j++)

{

A[i][j]=j;

B[i][j]=j;

}

}

before = clock();

matmul(C, A, B);

after = clock();

printf("\nTime taken to complete : %7.2lf secs\n", (float)(after - before)/ CLOCKS_PER_SEC); //List time taken to complete add function

}

void matmul(int c[][MAX], int a[][MAX], int b[][MAX])

{

int i, j, k, jj, kk;

for(j=0;j<MAX; j += BS)

{

for(k=0; k<MAX; k += BS)

{

for(i=0; i < MAX; i++)

{

for(kk=k; kk<k+BS; kk++)

{

for(jj=j; jj<j+BS; jj++)

{

c[i][jj] = (c[i][jj] + a[i][kk] * b[kk][jj]);

}

}

}

}

}

}

For more complete information about compiler optimizations, see our Optimization Notice.