# An example to show performance data for different implementations of Pi calculating

Here is a simple example to show you performance data for different implementations of calculating Pi. Yes, this is an extremely simple example, but it illustrates my previous post well: you can control threads easily by using traditional Windows* API programming, OpenMP* code is concise and effective, and TBB code is optimized through templates — at the very least it shows you how to use OpenMP* and TBB in your own code.

I used the Intel® C/C++ Compiler 11.0.066, which has already been released.

[Source begin]

#include<windows.h>

#include<stdio.h>

#include<iostream>

#include<time.h>

#include<omp.h>

#include"tbb/parallel_for.h"

#include"tbb/blocked_range.h"

#include"tbb/spin_mutex.h"

#include"tbb/tick_count.h"

// --- Shared configuration and state used by all Pi implementations ---
// Fixed: "constint" was a fused token in the original post (compile error).

const int num_steps = 100000000;  // number of midpoint-rule integration steps

const int num_threads = 4; // My laptop is T61

double step = 0.0, pi = 0.0;  // step width and accumulated result (globals shared by all versions)

static tbb::spin_mutex myMutex;  // guards updates to `pi` in the TBB version

static CRITICAL_SECTION cs;  // guards updates to `pi` in the WinThreads version

void Serial_Pi()

{

double x, sum = 0.0;

int i;

step = 1.0/(double) num_steps;

for (i=0; i< num_steps; i++){

x = (i+0.5)*step;

sum = sum + 4.0/(1.0 + x*x);

}

pi = step * sum;

}

// NOTE(review): the Windows thread entry point's signature line was lost in
// the blog-post extraction; presumably something like
//   DWORD WINAPI threadFunction(LPVOID pArg)
// where pArg points to this thread's index in tNum[] — TODO confirm against
// the original source.
{

double partialSum = 0.0, x;  // local to each thread

int myNum = *((int *)pArg);  // this thread's index (presumably 0..num_threads-1; see tNum[i] = i in the driver)

step = 1.0/(double) num_steps;

// Cyclic work distribution: thread k handles iterations k, k+num_threads, ...
for ( int i=myNum; i<num_steps; i+=num_threads )  // use every num_threads step

{

x = (i + 0.5)*step;

partialSum += 4.0 / (1.0 + x*x);  //compute partial sums at each thread

}

// Serialize the single update of the shared global `pi`.
EnterCriticalSection(&cs);

pi += partialSum * step;  // add partial to global final answer

LeaveCriticalSection(&cs);

return 0;

}

// NOTE(review): this fragment appears to be the WinThreads driver function;
// its signature, the declarations of tNum[] and the thread handles, most of
// the CreateThread(...) call (only the "0, // Stack size" argument survives),
// and the WaitForMultipleObjects/CloseHandle cleanup were all lost in the
// blog-post extraction — TODO recover from the original source.
{

InitializeCriticalSection(&cs);

step = 1.0 / num_steps;

// Launch one worker per thread; tNum[i] passes each thread its own index.
for ( int i=0; i<num_threads; ++i )

{

tNum[i] = i;

// (truncated CreateThread call — the surviving line below is its stack-size argument)
0,               // Stack size

}

}

void OpenMP_Pi()

{

double x, sum=0.0;

int i;

step = 1.0 / (double)num_steps;

#pragma omp parallel forprivate (x) reduction(+:sum) //schedule(static,4)

for (i=0; i<num_steps; i++)

{

x = (i + 0.5)*step;

sum = sum + 4.0/(1. + x*x);

}

pi = sum*step;

}

class ParallelPi {

public:

voidoperator() (tbb::blocked_range<int>& range) const {

double x, sum = 0.0;

for (int i = range.begin(); i < range.end(); ++i) {

x = (i+0.5)*step;

sum = sum + 4.0/(1.0 + x*x);

}

tbb::spin_mutex::scoped_lock lock(myMutex);

pi += step * sum;

}

};

/// Computes Pi with tbb::parallel_for over [0, num_steps) using the
/// ParallelPi body and an auto_partitioner to choose grain sizes.
/// Writes the step width into the global `step`; ParallelPi accumulates
/// the result into the global `pi`.
void TBB_Pi ()
{
    step = 1.0/(double) num_steps;

    // Explicitly qualified: the original relied on argument-dependent lookup
    // via tbb::blocked_range to find parallel_for.
    tbb::parallel_for (tbb::blocked_range<int> (0, num_steps), ParallelPi(), tbb::auto_partitioner());
}

/// Runs each Pi implementation in turn and prints the computed value and
/// elapsed time. Note clock() measures wall time on Windows (the original
/// target) but CPU time on POSIX systems.
int main()
{
    clock_t start, stop;

    // Computing pi by using serial code
    pi = 0.0;
    start = clock();
    Serial_Pi();
    stop = clock();
    printf ("Computed value of Pi by using serial code: %12.9f\n", pi);
    // Fixed: divide by CLOCKS_PER_SEC rather than the hard-coded 1000.0;
    // the two coincide on Windows, but CLOCKS_PER_SEC is portable.
    printf ("Elapsed time: %.2f seconds\n", (double)(stop-start)/CLOCKS_PER_SEC);

    // Computing pi by using Windows Threads
    pi = 0.0;
    start = clock();
    // NOTE(review): the call that launches the WinThreads version was lost in
    // the blog-post extraction — nothing executes between these two clock()
    // calls. TODO restore the driver call from the original source.
    stop = clock();
    printf ("Computed value of Pi by using WinThreads: %12.9f\n", pi);
    printf ("Elapsed time: %.2f seconds\n", (double)(stop-start)/CLOCKS_PER_SEC);

    // Computing pi by using OpenMP
    pi = 0.0;
    start = clock();
    OpenMP_Pi();
    stop = clock();
    printf ("Computed value of Pi by using OpenMP: %12.9f\n", pi);
    printf ("Elapsed time: %.2f seconds\n", (double)(stop-start)/CLOCKS_PER_SEC);

    // Computing pi by using TBB
    pi = 0.0;
    start = clock();
    TBB_Pi();
    stop = clock();
    printf ("Computed value of Pi by using TBB: %12.9f\n", pi);
    printf ("Elapsed time: %.2f seconds\n", (double)(stop-start)/CLOCKS_PER_SEC);

    return 0;
}

[End of source]

Here are results:

Computed value of Pi by using serial code:  3.141592654

Elapsed time: 0.78 seconds

Computed value of Pi by using WinThreads:  3.141592654

Elapsed time: 0.55 seconds

Computed value of Pi by using OpenMP:  3.141592654

Elapsed time: 0.42 seconds

Computed value of Pi by using TBB:  3.141592654

Elapsed time: 0.44 seconds

That is why I recommend using OpenMP* or TBB instead of WinThreads.

Для получения подробной информации о возможностях оптимизации компилятора обратитесь к нашему Уведомлению об оптимизации.
Возможность комментирования русскоязычного контента была отключена. Узнать подробнее.