An example showing performance data for different implementations of calculating Pi

Last time I posted the topic "Compare Windows* threads, OpenMP*, Intel® Threading Building Blocks for parallel programming" (/en-us/blogs/2008/12/16/compare-windows-threads-openmp-intel-threading-building-blocks-for-parallel-programming) and listed the advantages and disadvantages of each model.

 

Here is a simple example that shows performance data for different implementations of calculating Pi. Yes, it is an extremely simple example, but it illustrates my previous post well: you can control threads explicitly with the traditional Windows* API, the OpenMP* version is short and effective, and the TBB version is built on templates. At the very least, it shows how to use OpenMP* and TBB in your own code.

 

I used the Intel® C/C++ Compiler 11.0.066, which has already been released.
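
To build the example, a command line along these lines should work (a sketch rather than the exact command; it assumes the TBB include and library paths are already set in your environment, e.g. via tbbvars.bat, and that the source file is named pi.cpp):

icl /O2 /Qopenmp pi.cpp tbb.lib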

 

[Source begin]

 

#include <windows.h>
#include <stdio.h>
#include <iostream>
#include <time.h>
#include <omp.h>

#include "tbb/task_scheduler_init.h"
#include "tbb/parallel_for.h"
#include "tbb/blocked_range.h"
#include "tbb/spin_mutex.h"
#include "tbb/tick_count.h"

const int num_steps = 100000000;
const int num_threads = 4; // my laptop is a ThinkPad T61
double step = 0.0, pi = 0.0;

static tbb::spin_mutex myMutex;  // protects pi in the TBB version
static CRITICAL_SECTION cs;      // protects pi in the WinThreads version

void Serial_Pi()
{
   double x, sum = 0.0;
   int i;

   // integrate 4/(1+x*x) over [0,1] with the midpoint rule
   step = 1.0/(double) num_steps;
   for (i=0; i< num_steps; i++){
      x = (i+0.5)*step;
      sum = sum + 4.0/(1.0 + x*x);
   }
   pi = step * sum;
}

DWORD WINAPI threadFunction(LPVOID pArg)
{
        double partialSum = 0.0, x;  // local to each thread
        int myNum = *((int *)pArg);

        step = 1.0/(double) num_steps;
        for ( int i=myNum; i<num_steps; i+=num_threads )  // stride by num_threads
        {
                x = (i + 0.5)*step;
                partialSum += 4.0 / (1.0 + x*x);  // compute a partial sum in each thread
        }

        EnterCriticalSection(&cs);
          pi += partialSum * step;  // add the partial result to the global answer
        LeaveCriticalSection(&cs);

        return 0;
}

void WinThread_Pi()
{
        HANDLE threadHandles[num_threads];
        int tNum[num_threads];

        InitializeCriticalSection(&cs);
        step = 1.0 / num_steps;
        for ( int i=0; i<num_threads; ++i )
        {
                tNum[i] = i;
                threadHandles[i] = CreateThread( NULL,            // Security attributes
                                                 0,               // Stack size
                                                 threadFunction,  // Thread function
                                                 (LPVOID)&tNum[i],// Data for thread func()
                                                 0,               // Thread start mode
                                                 NULL);           // Returned thread ID
        }
        WaitForMultipleObjects(num_threads, threadHandles, TRUE, INFINITE);

        for ( int i=0; i<num_threads; ++i )  // clean up the handles and the critical section
                CloseHandle(threadHandles[i]);
        DeleteCriticalSection(&cs);
}

void OpenMP_Pi()
{
        double x, sum = 0.0;
        int i;

        step = 1.0 / (double)num_steps;

        omp_set_num_threads(num_threads);
#pragma omp parallel for private(x) reduction(+:sum) //schedule(static,4)
        for (i=0; i<num_steps; i++)
        {
                x = (i + 0.5)*step;
                sum = sum + 4.0/(1.0 + x*x);
        }

        pi = sum*step;
}

class ParallelPi {

public:

        // body for tbb::parallel_for; accumulates a partial sum per sub-range
        void operator() (const tbb::blocked_range<int>& range) const {
                double x, sum = 0.0;
                for (int i = range.begin(); i < range.end(); ++i) {
                        x = (i+0.5)*step;
                        sum = sum + 4.0/(1.0 + x*x);
                }
                tbb::spin_mutex::scoped_lock lock(myMutex);  // serialize the update of pi
                pi += step * sum;
        }
};

void TBB_Pi ()
{
        step = 1.0/(double) num_steps;
        tbb::parallel_for (tbb::blocked_range<int> (0, num_steps), ParallelPi(), tbb::auto_partitioner());
}

int main()
{
        clock_t start, stop;

        // Computing Pi by using serial code
        pi = 0.0;
        start = clock();
        Serial_Pi();
        stop = clock();
        printf ("Computed value of Pi by using serial code: %12.9f\n", pi);
        printf ("Elapsed time: %.2f seconds\n", (double)(stop-start)/CLOCKS_PER_SEC);

        // Computing Pi by using Windows threads
        pi = 0.0;
        start = clock();
        WinThread_Pi();
        stop = clock();
        printf ("Computed value of Pi by using WinThreads: %12.9f\n", pi);
        printf ("Elapsed time: %.2f seconds\n", (double)(stop-start)/CLOCKS_PER_SEC);

        // Computing Pi by using OpenMP
        pi = 0.0;
        start = clock();
        OpenMP_Pi();
        stop = clock();
        printf ("Computed value of Pi by using OpenMP: %12.9f\n", pi);
        printf ("Elapsed time: %.2f seconds\n", (double)(stop-start)/CLOCKS_PER_SEC);

        // Computing Pi by using TBB
        tbb::task_scheduler_init tbb_init;  // start the TBB scheduler before timing

        pi = 0.0;
        start = clock();
        TBB_Pi();
        stop = clock();
        printf ("Computed value of Pi by using TBB: %12.9f\n", pi);
        printf ("Elapsed time: %.2f seconds\n", (double)(stop-start)/CLOCKS_PER_SEC);

        return 0;
}

 [End of source]
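
By the way, the listing includes "tbb/tick_count.h" but never uses it. tbb::tick_count is TBB's portable wall-clock timer, and a sketch like the following could replace the clock() calls around any of the runs:

tbb::tick_count t0 = tbb::tick_count::now();
TBB_Pi();
tbb::tick_count t1 = tbb::tick_count::now();
printf ("Elapsed time: %.2f seconds\n", (t1 - t0).seconds());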

 

Here are the results:

Computed value of Pi by using serial code:  3.141592654
Elapsed time: 0.78 seconds
Computed value of Pi by using WinThreads:  3.141592654
Elapsed time: 0.55 seconds
Computed value of Pi by using OpenMP:  3.141592654
Elapsed time: 0.42 seconds
Computed value of Pi by using TBB:  3.141592654
Elapsed time: 0.44 seconds

 

All three threaded versions compute the same value, and the OpenMP* and TBB versions run at least as fast as the hand-coded Windows threads version while requiring far less code. That is why I recommend using OpenMP* or TBB instead of WinThreads.
