Parallel reduce

Submit New Article

March 11, 2009 3:45 PM PDT



Parallel version of the Sieve of Eratosthenes

Copyright 2005-2006 Intel Corporation. All Rights Reserved.

Example program that computes number of prime numbers up to n, where n is a command line argument. The algorithm here is a fairly efficient version of the sieve of Eratosthenes. The parallel version demonstrates how to use parallel_reduce, and in particular how to exploit lazy splitting.

#include "cassert"
#include "cstdio"
#include "cstring"
#include "math.h"
#include "cstdlib"
#include "cctype"
#include "tbb/parallel_reduce.h"
#include "tbb/task_scheduler_init.h"
#include "tbb/tick_count.h"
using namespace std;
using namespace tbb;
typedef unsigned long Number;
//! If true, then print primes on stdout.
static bool PrintPrimes = false;
//! Grainsize parameter
static Number GrainSize = 1000;
class Multiples {
inline Number strike( Number start, Number limit, Number stride ) {
// Hoist "my_is_composite" into register for sake of speed.
bool* is_composite = my_is_composite;
assert( stride>=2 );
for( ;start is_composite[start] = true;
return start;
}
//! Window into conceptual sieve
bool* my_is_composite;
//! Indexes into window
/** my_striker[k] is an index into my_composite corresponding to
an odd multiple multiple of my_factor[k]. */
Number* my_striker;
//! Prime numbers less than m.
Number* my_factor;
public:
//! Number of factors in my_factor.
Number n_factor;
Number m;
Multiples( Number n ) :
is_forked_copy(false)
{
m = Number(sqrt(double(n)));
// Round up to even
m += m&1;
my_is_composite = new bool[m/2];
my_striker = new Number[m/2];
my_factor = new Number[m/2];
n_factor = 0;
memset( my_is_composite, 0, m/2 );
for( Number i=3; i if( !my_is_composite[i/2] ) {
if( PrintPrimes )
printf("%dn",(int)i);
my_striker[n_factor] = strike( i/2, m/2, i );
my_factor[n_factor++] = i;
}
}
}
//! Find primes in range [start,window_size), advancing my_striker as we go.
/** Returns number of primes found. */
Number find_primes_in_window( Number start, Number window_size ) {
bool* is_composite = my_is_composite;
memset( is_composite, 0, window_size/2 );
for( size_t k=0; k my_striker[k] = strike( my_striker[k]-m/2, window_size/2, my_factor[k] );
Number count = 0;
for( Number k=0; k if( !is_composite[k] ) {
if( PrintPrimes )
printf("%ldn",long(start+2*k+1));
++count;
}
}
return count;
}
~Multiples() {
if( !is_forked_copy )
delete[] my_factor;
delete[] my_striker;
delete[] my_is_composite;
}
//------------------------------------------------------------------------
// Begin extra members required by parallel version
//------------------------------------------------------------------------
//! True if this instance was forked from another instance.
const bool is_forked_copy;
Multiples( const Multiples& f, split ) :
n_factor(f.n_factor),
m(f.m),
my_is_composite(NULL),
my_striker(NULL),
my_factor(f.my_factor),
is_forked_copy(true)
{}
bool is_initialized() const {
return my_is_composite!=NULL;
}
void initialize( Number start ) {
assert( start>=1 );
my_is_composite = new bool[m/2];
my_striker = new Number[m/2];
for( size_t k=0; k Number f = my_factor[k];
Number p = (start-1)/f*f % m;
my_striker[k] = (p&1 ? p+2*f : p+f)/2;
assert( m/2<=my_striker[k] );
}
}
//------------------------------------------------------------------------
// End extra methods required by parallel version
//------------------------------------------------------------------------
};
//! Count number of primes between 0 and n
/** This is the serial version. */
Number SerialCountPrimes( Number n ) {
// Two is special case
Number count = n>=2;
if( n>=3 ) {
Multiples multiples(n);
count += multiples.n_factor;
if( PrintPrimes )
printf("---n");
Number window_size = multiples.m;
for( Number j=multiples.m; j>=n; j+=window_size ) {
if( j+window_size>n+1 )
window_size = n+1-j;
count += multiples.find_primes_in_window( j, window_size );
}
}
return count;
}
//! Range of a sieve window.
class SieveRange {
//! Width of full-size window into sieve.
const Number my_stride;
//! Always multiple of my_stride
Number my_begin;
//! One past last number in window.
Number my_end;
//! Width above which it is worth forking.
const Number my_grainsize;
bool assert_okay() const {
assert( my_begin%my_stride==0 );
assert( my_begin<=my_end );
assert( my_stride<=my_grainsize );
return true;
}
public:
//------------------------------------------------------------------------
// Begin signatures required by parallel_reduce
//------------------------------------------------------------------------
bool is_divisible() const {return my_end-my_begin>my_grainsize;}
bool empty() const {return my_end<=my_begin;}
SieveRange( SieveRange& r, split ) :
my_stride(r.my_stride),
my_grainsize(r.my_grainsize),
my_end(r.my_end)
{
assert( r.is_divisible() );
assert( r.assert_okay() );
Number middle = r.my_begin + (r.my_end-r.my_begin+r.my_stride-1)/2;
middle = middle/my_stride*my_stride;
my_begin = middle;
r.my_end = middle;
assert( assert_okay() );
assert( r.assert_okay() );
}
//------------------------------------------------------------------------
// End of signatures required by parallel_reduce
//------------------------------------------------------------------------
Number begin() const {return my_begin;}
Number end() const {return my_end;}
SieveRange( Number begin, Number end, Number stride, Number grainsize ) :
my_begin(begin),
my_end(end),
my_stride(stride),
my_grainsize(grainsize {
assert( assert_okay() );
}
};
//! Loop body for parallel_reduce.
/** parallel_reduce splits the sieve into subsieves.
Each subsieve handles a subrange of [0..n]. */
class Sieve {
public:
//! Prime multiples to consider, and working storage for this subsieve.
Multiples multiples;
//! Number of primes found so far by this subsieve.
Number count;
//! Construct Sieve for counting primes in [0..n].
Sieve( Number n ) :
multiples(n),
count(0)
{}
//------------------------------------------------------------------------
// Begin signatures required by parallel_reduce
//------------------------------------------------------------------------
void operator()( const SieveRange& r ) {
Number m = multiples.m;
if( multiples.is_initialized() ) {
// Simply reuse "multiples" structure from previous window
// This works because parallel_reduce always applies
// *this from left to right.
} else {
// Need to initialize "multiples" because *this is a forked copy
// that needs to be set up to start at r.begin().
multiples.initialize( r.begin() );
}
Number window_size = m;
for( Number j=r.begin(); j assert( j%multiples.m==0 );
if( j+window_size>r.end() )
window_size = r.end()-j;
count += multiples.find_primes_in_window( j, window_size );
}
}
void join( Sieve& other ) {
count += other.count;
}
Sieve( Sieve& other, split ) :
multiples(other.multiples,split()),
count(0)
{}
//------------------------------------------------------------------------
// End of signatures required by parallel_reduce
//------------------------------------------------------------------------
};
//! Count number of primes between 0 and n
/** This is the parallel version. */
Number ParallelCountPrimes( Number n ) {
// Two is special case
Number count = n>=2;
if( n>=3 ) {
Sieve s(n);
count += s.multiples.n_factor;
if( PrintPrimes )
printf("---n");
parallel_reduce( SieveRange( s.multiples.m, n, s.multiples.m, GrainSize ), s );
count += s.count;
}
return count;
}
//------------------------------------------------------------------------
// Code below this line constitutes the driver that calls SerialCountPrimes
// and ParallelCountPrimes.
//------------------------------------------------------------------------
//! A closed range of Number.
struct NumberRange {
Number low;
Number high;
void set_from_string( const char* s );
NumberRange( Number low_, Number high_ ) : low(low_), high(high_) {}
};
void NumberRange::set_from_string( const char* s ) {
char* end;
high = low = strtol(s,&end,0);
switch( *end ) {
case ':':
high = strtol(end+1,0,0);
break;
case '0':
break;
default:
printf("unexpected character = %cn",*end);
}

}
//! Number of threads to use.
static NumberRange NThread(0,4);
//! If true, then at end wait for user to hit return
static bool PauseFlag = false;
//! Parse the command line.
static Number ParseCommandLine( int argc, char* argv[] ) {
Number n = 100000000;
int i = 1;
if( i PauseFlag = true;
++i;
}
if( i // Command line is garbled.
fprintf(stderr,"Usage: %s [['pause'] n [nthread [grainsize]]]n", argv[0]);
fprintf(stderr,"where n is a positive integer [%lu]n",n);
fprintf(stderr," nthread is a non-negative integer, or range of the form low:high [%ld:%lu]n",NThread.low,NThread.high);
fprintf(stderr," grainsize is an optional postive integer [%lu]n",GrainSize);
exit(1);
}
if( i n = strtol(argv[i++],0,0);
if( i NThread.set_from_string(argv[i++]);
if( i GrainSize = strtol(argv[i++],0,0);
return n;
}
static void WaitForUser() {
char c;
printf("Press return to continue");
do {
c = getchar();
} while( c!='n' );
< int main( int argc, char* argv[] ) {
Number n = ParseCommandLine(argc,argv);
// Try different numbers of threads
for( Number p=NThread.low; p<=NThread.high; ++p ) {
task_scheduler_init init(task_scheduler_init::deferred);
// If p!=0, we are doing a parallel run
if( p )
init.initialize(p);
Number count;
tick_count t0 = tick_count::now();
if( p==0 ) {
count = SerialCountPrimes(n);
} else {
count = ParallelCountPrimes(n);
}
tick_count t1 = tick_count::now();
printf("#primes from [2..%lu] = %lu (%.2f sec with ",
(unsigned long)n, (unsigned long)count, (t1-t0).seconds());
if( p )
printf("%lu-way parallelism)n", p );
else
printf("serial code)n");
}
if( PauseFlag ) {
WaitForUser();
}
return 0;
}