Use the __itt_sync_ APIs from Intel VTune Amplifier XE 2011 to capture user-defined synchronization primitives

The "Locks and Waits" analysis reports how much time a program spends waiting on synchronization objects, and shows where performance suffers because waits occur while the CPU is under-utilized. The objects it tracks are normally those created through system APIs and I/O operations, such as Mutex, Semaphore, Stream, etc.

If developers use their own synchronization objects that are not system sync primitives, VTune™ Amplifier XE 2011 cannot detect them. In that case the developers should add calls to the following APIs to their source code:

__itt_sync_create

__itt_sync_destroy

__itt_sync_prepare

__itt_sync_acquired

__itt_sync_releasing

 

How to use the required include file and library file is described in the article "Use new Pause and Resume API from Intel VTune Amplifier XE 2011".

 

Here is example code that uses the __itt_sync APIs, for reference:

#include <stdio.h>

#include <stdlib.h>

#include <math.h>

#include <windows.h>

#include <ittnotify.h>

 

// Problem size. The particle and iteration counts are kept low for Thread Checker.
#define NPARTS      10    // number of particles
#define NITER       100   // number of iterations run by main
#define DIMS        3     // coordinates stored per particle
#define NUM_THREADS 16    // worker threads started per iteration

 

// Forward declarations. rand() is not redeclared here; <stdlib.h> provides it.

// Position helpers operating on the global coordinate array.
void initPositions(void);
void updatePositions(void);

// Thread entry point; pArg points at the worker's int thread index.
DWORD WINAPI computePot(LPVOID pArg);

 

// Particle coordinates: r[d][p] is coordinate d of particle p.
double r[DIMS][NPARTS];

// Per-thread work range: bounds[0][t] is the first particle index handled by
// worker t, bounds[1][t] is one past the last.
int    bounds[2][NUM_THREADS];

// Potential accumulated during the current iteration; written by every worker.
double pot;

// User-defined lock word: 0 = free, 1 = held.
// Declared long because the _Interlocked* intrinsics applied to it take a
// long volatile * argument; an int lock word is an incompatible pointer type.
long   spin=0;

// Windows critical section (a system sync object), initialised in main.
CRITICAL_SECTION cs;

 

int main() {

   int i, j;

   HANDLE tHandle[NUM_THREADS];

   int    tNum[NUM_THREADS];

 

   for (i=0; i<NUM_THREADS; i++) {

      bounds[0][i] =    i  * (NPARTS/NUM_THREADS);

      bounds[1][i] = (i+1) * (NPARTS/NUM_THREADS);

   }

   bounds[1][NUM_THREADS-1] = NPARTS;

 

   initPositions();

   updatePositions();

 

   InitializeCriticalSection(&cs);     

 

   // Create user defined sync object

   __itt_sync_create(&spin, NULL, "spin",  __itt_attr_mutex);

 

   for( i=0; i<NITER; i++ ) {

      pot = 0.0;

      for (j=0; j<NUM_THREADS; j++) {

         tNum[j] = j;

         tHandle[j] = CreateThread(NULL, 0, computePot, &tNum[j], 0, NULL);

      }

      WaitForMultipleObjects(NUM_THREADS, tHandle, TRUE, INFINITE);

 

      if (i%10 == 0) printf("%5d: Potential: %10.3f\n", i, pot);

      updatePositions();

   }

 

   // Destroy the sync object

   __itt_sync_destroy(&spin);

}

 

 

/*
 * Fill the global coordinate array r with starting values.
 * Each entry is set to 0.5 plus rand() scaled by RAND_MAX, visiting the
 * array one dimension at a time.
 */
void initPositions() {
   int dim, p;

   for( dim=0; dim<DIMS; dim++ ) {
      double *row = r[dim];

      for( p=0; p<NPARTS; p++ ) {
         const double unit = (double) rand() / (double) RAND_MAX;

         row[p] = 0.5 + unit;
      }
   }
}

 

 

/*
 * Move every particle: each entry of the global coordinate array r is
 * decreased by 0.5 plus rand() scaled by RAND_MAX, visiting the array one
 * dimension at a time.
 */
void updatePositions() {
   int dim, p;

   for( dim=0; dim<DIMS; dim++ ) {
      double *row = r[dim];

      for( p=0; p<NPARTS; p++ ) {
         const double unit = (double) rand() / (double) RAND_MAX;

         row[p] -= 0.5 + unit;
      }
   }
}

 

 

/*
 * Thread entry point: accumulate the potential for one worker's particles.
 *
 * pArg points at an int holding the worker's thread index. The worker sums
 * 1/distance over its slice [bounds[0][tid], bounds[1][tid]) into a local
 * variable, then adds that partial sum to the shared total `pot` while
 * holding the user-defined spin lock `spin`. The __itt_sync_* calls report
 * the lock's wait/acquire/release events to the profiler.
 *
 * Always returns 0.
 */
DWORD WINAPI computePot(LPVOID pArg) {
   int i, j, start, end, tid;
   long my_spin;   // lock value observed by the compare-exchange (0 = we took it)

   double distx, disty, distz, dist;  // work variables should be local
   double lpot = 0.0;  // use local variable to collect partial results

   tid = *(int *)pArg;
   start = bounds[0][tid];
   end   = bounds[1][tid];

   for( i=start; i<end; i++ ) {
      // NOTE(review): the bound j<i-1 leaves out the pair (i-1, i); confirm
      // whether j<i was intended. Kept as-is to preserve the program's output.
      for( j=0; j<i-1; j++ ) {
        distx = pow( (r[0][j] - r[0][i]), 2 );
        disty = pow( (r[1][j] - r[1][i]), 2 );
        distz = pow( (r[2][j] - r[2][i]), 2 );
        dist = sqrt( distx + disty + distz );

        lpot += 1.0 / dist;
      }
   }

   /*
    * Equivalent update using a Windows system sync object (kept for comparison):
    *
    *    EnterCriticalSection(&cs);
    *       pot += lpot;
    *    LeaveCriticalSection(&cs);
    */

   // Tell the profiler this thread is about to wait on the lock.
   __itt_sync_prepare (&spin);

   // Spin until the lock word changes from 0 to 1 on our behalf. The cast keeps
   // the call well-typed whether the lock word is declared int or long (both
   // are 32 bits wide on Windows).
   do {
      my_spin = _InterlockedCompareExchange ((volatile long *)&spin, 1, 0);
   } while (my_spin == 1);

   __itt_sync_acquired (&spin);
   pot += lpot;
   // Must be reported before the lock is actually released.
   __itt_sync_releasing (&spin);

   // Release: the holder takes the lock word from 1 back to 0.
   _InterlockedDecrement ((volatile long *)&spin);

   return 0;
}

 

After running the Locks and Waits analysis, you can see the user synchronization object named "spin" in the report, with less impact on performance:

itt_sync.jpg

For more complete information about compiler optimizations, see our Optimization Notice.