Hi,
Please find the QPI usage and the program below. To explain what the program does,
Allocates the memory by binding to a particular CPU.
Then launching threads and binding those to each of the CPU cores.
Each of these threads read data from the memory allocated initially and modifies the data.
My question is why is the QPI usage so low? When the memory is allocated on one of the CPU and accessed from the other CPU, there should be Intersocket QPI, isn't it?
Machine details:-
Processor: 2XL5518( ie 4 cores each * 2 CPUs * 2Hyperthreads = 16 Logical CPUs )
Memory: 96GB
Nehalem architecture
QPI Usage
---------------------------------------||---------------------------------------
Sampling Duration: 1000387.50 micro-seconds
--- Logical Processor 3 ---||--- Logical Processor 4 ---
---------------------------------------||---------------------------------------
--- Intersocket QPI Utilization ---||--- Intersocket QPI Utilization ---
---------------------------------------||---------------------------------------
--- Reads (MB/s): 45.06 ---||--- Reads (MB/s): 26.16 ---
--- Writes(MB/s): 5.06 ---||--- Writes(MB/s): 7.13 ---
---------------------------------------||---------------------------------------
--- Memory Performance Monitoring ---||--- Memory Performance Monitoring ---
---------------------------------------||---------------------------------------
--- Mem Ch 0: Reads (MB/s): 1906.66 ---||--- Mem Ch 0: Reads (MB/s): 1985.50 ---
--- Writes(MB/s): 2701.29 ---||--- Writes(MB/s): 2583.51 ---
--- Mem Ch 1: Reads (MB/s): 1906.79 ---||--- Mem Ch 1: Reads (MB/s): 1985.47 ---
--- Writes(MB/s): 2701.21 ---||--- Writes(MB/s): 2583.51 ---
--- Mem Ch 2: Reads (MB/s): 1906.74 ---||--- Mem Ch 2: Reads (MB/s): 1985.37 ---
--- Writes(MB/s): 2701.06 ---||--- Writes(MB/s): 2583.34 ---
--- ND0 Mem Read Traffic: 5720.19 ---||--- ND1 Mem Read Traffic: 5956.34 ---
--- ND0 Mem Write Traffic: 8103.56 ---||--- ND1 Mem Write Traffic: 7750.36 ---
--- ND0 Memory Throughput: 13823.75 ---||--- ND1 Memory Throughput: 13706.70 ---
---------------------------------------||---------------------------------------
--- System Read Throughput(MB/s): 11676.53 ---
--- System Write Throughput(MB/s): 15853.92 ---
--- System Memory Throughput(MB/s): 27530.44 ---
---------------------------------------||---------------------------------------
Program
#define _GNU_SOURCE
#include
#include
#include
#include
#include
#include
typedef struct
{
char* pDataIn;
char* pDataOut;
unsigned int size;
unsigned int cpuno;
} tInputData;
int bRun = 1;
void* exploitmemory( void* pParam )
{
int val = 0;
cpu_set_t cpuset, testset;
tInputData* pInp = (tInputData*) pParam;
CPU_ZERO( &cpuset );
CPU_SET( pInp->cpuno, &cpuset );
if( sched_setaffinity( 0, sizeof(cpu_set_t), &cpuset ))
{
printf( "CPU No: error\\n", pInp->cpuno );
perror( "sched_setaffinithy" );
return NULL;
}
printf( "Thread running on CPU:%d\\n", pInp->cpuno );
while( bRun )
{
sched_getaffinity( 0, sizeof(cpu_set_t), &testset );
int i = 0;
for(i=0 ; i<16;++i)
{
if( CPU_ISSET( i, &testset ))
{
fprintf( stderr, " %d ", i );
fflush(stderr);
}
}
memcpy( pInp->pDataOut, pInp->pDataIn, pInp->size );
memset( pInp->pDataIn, val, pInp->size );
}
printf( "\\nThread for CPU:%d exiting\\n", pInp->cpuno );
return NULL;
}
void* alloc_memory_on_cpu( unsigned int cpuno, unsigned int size )
{
cpu_set_t cpuset, testset;
CPU_ZERO( &cpuset );
CPU_SET( cpuno, &cpuset );
if( sched_setaffinity( 0, sizeof(cpu_set_t), &cpuset ))
{
printf( "CPU No: error\\n", cpuno );
perror( "sched_setaffinithy" );
return NULL;
}
sched_getaffinity( 0, sizeof(cpu_set_t), &testset );
int i = 0;
for(i=0 ; i<16;++i)
{
if( CPU_ISSET( i, &testset ))
{
printf( "%d CPU is allocated\\n", i );
}
}
void* data = malloc( size );
return data;
}
int main( int argc, char* argv[] )
{
cpu_set_t cpuset;
if( argc != 4 )
{
printf( "./a.out \\n" );
return 1;
}
int cpuno = atoi( argv[1] );
int memcpu = atoi(argv[2]);
int size = atoi(argv[3]);
int i;
tInputData data[16];
pthread_t threadId[16];
bRun = 1;
for( i = 0; i < cpuno; i++ )
{
data[i].pDataIn = (char*) alloc_memory_on_cpu( memcpu, size );
if( data[i].pDataIn == NULL )
{
return 1;
}
data[i].pDataOut = (char*) alloc_memory_on_cpu( memcpu, size );
if( data[i].pDataOut == NULL )
{
free( data[i].pDataIn );
return 1;
}
data[i].size = size;
data[i].cpuno = i;
pthread_create( &threadId[i], NULL, exploitmemory, (void*) &data[i] );
}
printf( "Enter q to exit test\\n" );
int breakch = 0;
while(breakch != 'q')
{
breakch = getchar();
}
bRun = 0;
for( i = 0 ; i < cpuno; i++ )
{
pthread_join( threadId[i], NULL );
}
for( i = 0; i < cpuno; i++ )
{
free( data[i].pDataIn );
free( data[i].pDataOut );
}
printf( "Main program exiting.\\n" );
return 0;
}
Executing the program :-
./a.out 16 0 524288000
Compiling the program :-
gcc -lpthread qpi_cpuaffinity.c
Regards,
Amal



