# An example to show performance data for different implementations of Pi calculating

Here is a simple example to show you performance data for different implementations of calculating Pi. Yes, this is an extremely simple example, but it illustrates my previous post well (you can control threads easily using traditional Windows* API programming; OpenMP* code is effective; and TBB code is optimized through templates) — at the very least it shows you how to use OpenMP* and TBB in your code.

I used Intel® C/C++ Compiler 11.066, which has already been released.

[Source begin]

#include <windows.h>

#include <stdio.h>

#include <iostream>

#include <time.h>

#include <omp.h>

#include "tbb/parallel_for.h"

#include "tbb/blocked_range.h"

#include "tbb/spin_mutex.h"

#include "tbb/tick_count.h"

// Shared state for all four Pi implementations below (serial, WinThreads,
// OpenMP, TBB): each one writes the rectangle width to 'step' and the
// integration result to 'pi'.
const int num_steps = 100000000;   // rectangles in the midpoint integration of 4/(1+x^2) over [0,1]

const int num_threads = 4; // My laptop is T61 -- hard-coded worker count for the WinThreads version

double step = 0.0, pi = 0.0;       // written by every implementation; main() resets 'pi' before each run

static tbb::spin_mutex myMutex;    // guards the final 'pi +=' in the TBB version

static CRITICAL_SECTION cs;        // guards the final 'pi +=' in the WinThreads version

void Serial_Pi()

{

double x, sum = 0.0;

int i;

step = 1.0/(double) num_steps;

for (i=0; i< num_steps; i++){

x = (i+0.5)*step;

sum = sum + 4.0/(1.0 + x*x);

}

pi = step * sum;

}

// NOTE(review): the signature of this Windows thread entry point was lost in
// extraction -- presumably something like
//   DWORD WINAPI myThreadFunction(LPVOID pArg)
// (it dereferences 'pArg' and returns 0). TODO: restore the declaration;
// without it this brace block does not compile.
{

double partialSum = 0.0, x;  // local to each thread

int myNum = *((int *)pArg);  // this thread's index, 0..num_threads-1

step = 1.0/(double) num_steps;

// Cyclic work split: thread 'myNum' handles i = myNum, myNum+num_threads, ...
for ( int i=myNum; i<num_steps; i+=num_threads )  // use every num_threads step

{

x = (i + 0.5)*step;

partialSum += 4.0 / (1.0 + x*x);  //compute partial sums at each thread

}

// Serialize the read-modify-write of the shared global 'pi'.
EnterCriticalSection(&cs);

pi += partialSum * step;  // add partial to global final answer

LeaveCriticalSection(&cs);

return 0;

}

// NOTE(review): this is the WinThreads launcher, but extraction lost its
// signature (e.g. 'void WinThread_Pi()') and most of its body: the
// declarations of tNum[] and the thread-handle array, the CreateThread(...)
// call that the stray '0, // Stack size' argument belongs to, the
// WaitForMultipleObjects(...) join, and DeleteCriticalSection(&cs).
// TODO: restore from the original post; as written this does not compile.
{

InitializeCriticalSection(&cs);

step = 1.0 / num_steps;

for ( int i=0; i<num_threads; ++i )

{

tNum[i] = i;  // per-thread index handed to each worker via its LPVOID argument

0,               // Stack size

}

}

// OpenMP implementation: the midpoint-rule loop over 4/(1+x^2) is split
// across the thread team and the per-thread partial sums are combined by
// the reduction clause. Writes the rectangle width to the global 'step'
// and the result to the global 'pi'.
void OpenMP_Pi()
{
    double sum = 0.0;
    double x;
    int i;

    step = 1.0 / (double)num_steps;

    // 'x' is per-iteration scratch, so each thread gets a private copy;
    // 'sum' is privatized and combined at the end by reduction(+:sum).
    #pragma omp parallel for private (x) reduction(+:sum) //schedule(static,4)
    for (i = 0; i < num_steps; i++) {
        x = (i + 0.5) * step;
        sum += 4.0 / (1. + x * x);
    }

    pi = sum * step;
}

class ParallelPi {

public:

void operator() (tbb::blocked_range<int>& range) const {

double x, sum = 0.0;

for (int i = range.begin(); i < range.end(); ++i) {

x = (i+0.5)*step;

sum = sum + 4.0/(1.0 + x*x);

}

tbb::spin_mutex::scoped_lock lock(myMutex);

pi += step * sum;

}

};

// TBB implementation: hands the index range [0, num_steps) to parallel_for,
// which splits it into blocked_range chunks executed by ParallelPi bodies.
// Writes the rectangle width to the global 'step'; ParallelPi accumulates
// the result into the global 'pi'.
void TBB_Pi ()
{
    step = 1.0/(double) num_steps;

    // Qualify the call explicitly: no 'using namespace tbb' is in scope, so the
    // original unqualified call only resolved via ADL on the tbb:: arguments.
    tbb::parallel_for (tbb::blocked_range<int> (0, num_steps), ParallelPi(), tbb::auto_partitioner());
}

// Runs each Pi implementation in turn, printing the computed value and the
// elapsed time measured with clock().
int main()
{
    clock_t start, stop;

    // Fix: clock() ticks CLOCKS_PER_SEC times per second; the original divided
    // by a hard-coded 1000.0, which is only correct where CLOCKS_PER_SEC == 1000.
    const double ticks = (double)CLOCKS_PER_SEC;

    // Computing pi by using serial code
    pi = 0.0;
    start = clock();
    Serial_Pi();
    stop = clock();
    printf ("Computed value of Pi by using serial code: %12.9f\n", pi);
    printf ("Elapsed time: %.2f seconds\n", (double)(stop-start)/ticks);

    // Computing pi by using Windows Threads
    // NOTE(review): the call that launches the WinThreads version is missing
    // from the extracted source (the launcher's name was lost), so as written
    // this section times nothing and would print pi == 0. TODO: restore it.
    pi = 0.0;
    start = clock();
    stop = clock();
    printf ("Computed value of Pi by using WinThreads: %12.9f\n", pi);
    printf ("Elapsed time: %.2f seconds\n", (double)(stop-start)/ticks);

    // Computing pi by using OpenMP
    pi = 0.0;
    start = clock();
    OpenMP_Pi();
    stop = clock();
    printf ("Computed value of Pi by using OpenMP: %12.9f\n", pi);
    printf ("Elapsed time: %.2f seconds\n", (double)(stop-start)/ticks);

    // Computing pi by using TBB
    pi = 0.0;
    start = clock();
    TBB_Pi();
    stop = clock();
    printf ("Computed value of Pi by using TBB: %12.9f\n", pi);
    printf ("Elapsed time: %.2f seconds\n", (double)(stop-start)/ticks);

    return 0;
}

[End of source]

Here are results:

Computed value of Pi by using serial code:  3.141592654

Elapsed time: 0.78 seconds

Computed value of Pi by using WinThreads:  3.141592654

Elapsed time: 0.55 seconds

Computed value of Pi by using OpenMP:  3.141592654

Elapsed time: 0.42 seconds

Computed value of Pi by using TBB:  3.141592654

Elapsed time: 0.44 seconds

That is why I recommend using OpenMP* or TBB instead of WinThreads.

Para obtener información más completa sobre las optimizaciones del compilador, consulte nuestro Aviso de optimización.
Categorías: