An example showing performance data for different implementations of calculating Pi

Last time I posted the topic "Compare Windows* threads, OpenMP*, Intel® Threading Building Blocks for parallel programming" (/en-us/blogs/2008/12/16/compare-windows-threads-openmp-intel-threading-building-blocks-for-parallel-programming) and listed the advantages and disadvantages of each.

 

Here is a simple example to show you performance data for different implementations of calculating Pi. Yes, this is an extremely simple example, but it illustrates my previous post well: you can control threads easily using the traditional Windows* API, OpenMP* code is concise and effective, and TBB code is organized around templates. At the very least, it shows you how to use OpenMP* and TBB in your own code.
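For reference, every version computes the same midpoint Riemann sum for the integral below, which is why each loop body evaluates x = (i+0.5)*step and accumulates 4.0/(1.0 + x*x):

\pi = \int_0^1 \frac{4}{1+x^2}\,dx \approx \sum_{i=0}^{N-1} \frac{4}{1+x_i^2}\,\Delta x, \qquad x_i = (i+0.5)\,\Delta x, \quad \Delta x = \frac{1}{N}, \quad N = \text{num\_steps}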

 

I used the Intel® C/C++ Compiler 11.066, which has already been released.
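If you want to build it yourself, here is a hypothetical command line for the Intel compiler on Windows (the file name is my own; I assume the source is saved as pi_compare.cpp and that the TBB include/library paths are already set in your environment):

icl /Qopenmp pi_compare.cpp tbb.lib

Note that /Qopenmp is required for the #pragma omp directives to take effect; without it they are silently ignored.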

 

[Source begin]

#include <windows.h>
#include <stdio.h>
#include <iostream>
#include <time.h>
#include <omp.h>

#include "tbb/task_scheduler_init.h"
#include "tbb/parallel_for.h"
#include "tbb/blocked_range.h"
#include "tbb/spin_mutex.h"
#include "tbb/tick_count.h"

const int num_steps = 100000000;
const int num_threads = 4; // My laptop is a T61

double step = 0.0, pi = 0.0;

static tbb::spin_mutex myMutex;
static CRITICAL_SECTION cs;

void Serial_Pi()
{
   double x, sum = 0.0;
   int i;

   step = 1.0/(double) num_steps;
   for (i=0; i<num_steps; i++){
      x = (i+0.5)*step;
      sum = sum + 4.0/(1.0 + x*x);
   }
   pi = step * sum;
}

DWORD WINAPI threadFunction(LPVOID pArg)
{
        double partialSum = 0.0, x;  // local to each thread
        int myNum = *((int *)pArg);

        step = 1.0/(double) num_steps;
        for ( int i=myNum; i<num_steps; i+=num_threads )  // stride by num_threads
        {
                x = (i + 0.5)*step;
                partialSum += 4.0 / (1.0 + x*x);  // compute a partial sum in each thread
        }

        EnterCriticalSection(&cs);
        pi += partialSum * step;  // add the partial sum to the global final answer
        LeaveCriticalSection(&cs);

        return 0;
}

void WinThread_Pi()
{
        HANDLE threadHandles[num_threads];
        int tNum[num_threads];

        InitializeCriticalSection(&cs);
        step = 1.0 / num_steps;
        for ( int i=0; i<num_threads; ++i )
        {
                tNum[i] = i;
                threadHandles[i] = CreateThread( NULL,            // Security attributes
                                                 0,               // Stack size
                                                 threadFunction,  // Thread function
                                                 (LPVOID)&tNum[i],// Data for thread func()
                                                 0,               // Thread start mode
                                                 NULL);           // Returned thread ID
        }
        WaitForMultipleObjects(num_threads, threadHandles, TRUE, INFINITE);

        for ( int i=0; i<num_threads; ++i )  // release the thread handles
                CloseHandle(threadHandles[i]);
        DeleteCriticalSection(&cs);
}

void OpenMP_Pi()
{
        double x, sum = 0.0;
        int i;

        step = 1.0 / (double)num_steps;

        omp_set_num_threads(4);
#pragma omp parallel for private(x) reduction(+:sum) //schedule(static,4)
        for (i=0; i<num_steps; i++)
        {
                x = (i + 0.5)*step;
                sum = sum + 4.0/(1. + x*x);
        }

        pi = sum*step;
}

class ParallelPi {

public:

        void operator() (const tbb::blocked_range<int>& range) const {
                double x, sum = 0.0;
                for (int i = range.begin(); i < range.end(); ++i) {
                        x = (i+0.5)*step;
                        sum = sum + 4.0/(1.0 + x*x);
                }
                tbb::spin_mutex::scoped_lock lock(myMutex);  // serialize updates of the shared result
                pi += step * sum;
        }
};

void TBB_Pi ()
{
        step = 1.0/(double) num_steps;
        tbb::parallel_for (tbb::blocked_range<int> (0, num_steps), ParallelPi(), tbb::auto_partitioner());
}

int main()
{
        clock_t start, stop;

        // Computing pi by using serial code
        pi = 0.0;
        start = clock();
        Serial_Pi();
        stop = clock();
        printf ("Computed value of Pi by using serial code: %12.9f\n", pi);
        printf ("Elapsed time: %.2f seconds\n", (double)(stop-start)/CLOCKS_PER_SEC);

        // Computing pi by using Windows threads
        pi = 0.0;
        start = clock();
        WinThread_Pi();
        stop = clock();
        printf ("Computed value of Pi by using WinThreads: %12.9f\n", pi);
        printf ("Elapsed time: %.2f seconds\n", (double)(stop-start)/CLOCKS_PER_SEC);

        // Computing pi by using OpenMP
        pi = 0.0;
        start = clock();
        OpenMP_Pi();
        stop = clock();
        printf ("Computed value of Pi by using OpenMP: %12.9f\n", pi);
        printf ("Elapsed time: %.2f seconds\n", (double)(stop-start)/CLOCKS_PER_SEC);

        // Computing pi by using TBB
        tbb::task_scheduler_init tbb_init;

        pi = 0.0;
        start = clock();
        TBB_Pi();
        stop = clock();
        printf ("Computed value of Pi by using TBB: %12.9f\n", pi);
        printf ("Elapsed time: %.2f seconds\n", (double)(stop-start)/CLOCKS_PER_SEC);

        return 0;
}

[End of source]

 

Here are the results:

Computed value of Pi by using serial code:  3.141592654

Elapsed time: 0.78 seconds

Computed value of Pi by using WinThreads:  3.141592654

Elapsed time: 0.55 seconds

Computed value of Pi by using OpenMP:  3.141592654

Elapsed time: 0.42 seconds

Computed value of Pi by using TBB:  3.141592654

Elapsed time: 0.44 seconds

 

That is why I recommend using OpenMP* or TBB instead of WinThreads.
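One more note on timing: the listing includes tbb/tick_count.h but never uses it. clock() works here because Windows defines CLOCKS_PER_SEC as 1000, but tbb::tick_count measures wall-clock time portably. A minimal sketch of the same measurement for the TBB run:

        tbb::tick_count t0 = tbb::tick_count::now();   // start timestamp
        TBB_Pi();
        tbb::tick_count t1 = tbb::tick_count::now();   // stop timestamp
        printf ("Elapsed time: %.2f seconds\n", (t1 - t0).seconds());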

Comments
Peter Wang (Intel):

Don't forget to use compiler options:
/Qopenmp, /Qstd=c++0x

Peter Wang (Intel):

Here is an implementation using Cilk+:

#include <cilk/cilk.h>
#include <cilk/reducer_opadd.h>
#include <cilk/cilk_api.h>  // for __cilkrts_set_param
...
void Cilk_For_Pi()
{
    // reducer_opadd gives each worker a private view of the sum,
    // so no explicit lock is needed (unlike the WinThreads and TBB versions)
    cilk::reducer_opadd<double> sum(0.0);

    __cilkrts_set_param("nworkers", "8"); // set worker count

    #pragma cilk grainsize = num_steps/32
    cilk_for (int i=0; i<num_steps; i++)
    {
        double x = (i+0.5)*step; // x must be local to each iteration to avoid a data race
        sum += 4.0/(1. + x*x);
    }

    pi += step * sum.get_value();
}

Call the function in main():

// Computing pi by using a cilk_for reducer
pi = 0.0;
step = 1.0 / (double)num_steps;
start = clock();
Cilk_For_Pi();
stop = clock();
printf ("Computed value of Pi by using cilk_for_reducer: %12.9f\n", pi);
printf ("Elapsed time: %.2f seconds\n", (double)(stop-start)/CLOCKS_PER_SEC);

Please use Composer XE 2011 SP1.

lynxfisher:

I executed the above code, but my results do not agree with yours. Repeated runs give the following, with almost no variation:
Computed value of Pi by using serial code: 3.141592654
Elapsed time: 0.75 seconds
Computed value of Pi by using WinThreads: 3.141592654
Elapsed time: 0.20 seconds
Computed value of Pi by using OpenMP: 3.141592654
Elapsed time: 0.20 seconds
Computed value of Pi by using TBB: 3.141592654
Elapsed time: 0.95 seconds

TBB does not live up to what you said. Why? Is the dynamic load-balancing mechanism that TBB provides not kicking in, or is it something else? I tried specifying the number of TBB threads at initialization, but it had no effect.

Maybe there are other reasons, e.g. hardware or software configuration?

Pourya Shirazian:

This is what I got from running the program (make sure you enable OpenMP support):

Computed value of Pi by using serial code: 3.141592654
Elapsed time: 1.18 seconds
Computed value of Pi by using WinThreads: 3.141592654
Elapsed time: 0.50 seconds
Computed value of Pi by using OpenMP: 3.141592654
Elapsed time: 0.41 seconds
Computed value of Pi by using TBB: 3.141592654
Elapsed time: 0.33 seconds

anonymous:

The result with the Intel C++ Compiler 11.0:
Computed value of Pi by using serial code: 3.141592654
Elapsed time: 3.25 seconds
Computed value of Pi by using WinThreads: 3.141592654
Elapsed time: 1.78 seconds
Computed value of Pi by using OpenMP: 3.141592654
Elapsed time: 1.61 seconds
Computed value of Pi by using TBB: 3.141592654
Elapsed time: 3.31 seconds

The result with the Visual Studio 2005 compiler:
Computed value of Pi by using serial code: 3.141592654
Elapsed time: 3.30 seconds
Computed value of Pi by using WinThreads: 3.141592654
Elapsed time: 0.84 seconds
Computed value of Pi by using OpenMP: 3.141592654
Elapsed time: 0.88 seconds
Computed value of Pi by using TBB: 3.141592654
Elapsed time: 3.94 seconds

This is the same code (copied verbatim). My host configuration: Intel Core(TM)2 CPU 6320 @ 1.86 GHz.
It is a very strange result, different from all of yours, and I can't make sense of it. Please explain it, thanks.
I set const int num_threads = 2 or 4; the result is the same.

Peter Wang (Intel):

I used the compiler switch "/Od" to build, which disables optimizations such as unrolling and vectorization.

Peter Wang (Intel):

Thanks for the comments. I should not have used the float data type for the conversion. I modified it:

DWORD WINAPI threadFunction(LPVOID pArg)
{
    double partialSum = 0.0, x; // local to each thread
    int myNum = *((int *)pArg);

    step = 1.0/(double) num_steps;
    for ( int i=myNum; i<num_steps; i+=num_threads )
    // ... (the rest is identical to threadFunction in the listing above)

Ilnar:

Intel's Anti-XSS filter erased the code where I put a "less than" symbol.

Results for the original num_steps = 100 million:

Computed value of Pi by using serial code: 3.141592654
Elapsed time: 0.59 seconds
Computed value of Pi by using WinThreads: 3.141592654
Elapsed time: 0.30 seconds
Computed value of Pi by using OpenMP: 3.141592654
Elapsed time: 0.34 seconds
Computed value of Pi by using TBB: 3.141592654
Elapsed time: 0.31 seconds

Ilnar:

The end of the comment was erased (((

DWORD WINAPI threadFunction(LPVOID pArg)
{
    double partialSum = 0.0, x; // local to each thread
    int myNum = *((int *)pArg);

    step = 1.0 / (double)num_steps;

    for (int i=myNum; i<num_steps; i+=num_threads)
    // ... (again cut off by the filter; the rest matches threadFunction in the listing above)

Ilnar:

Sorry, "has great unrolling techniques" -- I made a mistake; I meant "vectorization".
