Parallel reduce


Parallel version of the Sieve of Eratosthenes

Copyright 2005-2006 Intel Corporation. All Rights Reserved.

Example program that computes number of prime numbers up to n, where n is a command line argument. The algorithm here is a fairly efficient version of the sieve of Eratosthenes. The parallel version demonstrates how to use parallel_reduce, and in particular how to exploit lazy splitting.

#include "cassert"

#include "cstdio"

#include "cstring"

#include "math.h"

#include "cstdlib"

#include "cctype"

#include "tbb/parallel_reduce.h" 

#include "tbb/task_scheduler_init.h" 

#include "tbb/tick_count.h" 

using namespace std; 

using namespace tbb; 

typedef unsigned long Number; 

//! If true, then print primes on stdout. 

static bool PrintPrimes = false; 

//! Grainsize parameter 

static Number GrainSize = 1000; 

class Multiples { 

inline Number strike( Number start, Number limit, Number stride ) { 

// Hoist "my_is_composite" into register for sake of speed. 

bool* is_composite = my_is_composite; 

assert( stride>=2 ); 

for( ;start
is_composite[start] = true; 

return start; 

} 

//! Window into conceptual sieve 

bool* my_is_composite; 

//! Indexes into window 

/** my_striker[k] is an index into my_composite corresponding to 

an odd multiple multiple of my_factor[k]. */ 

Number* my_striker; 

//! Prime numbers less than m. 

Number* my_factor; 

public: 

//! Number of factors in my_factor. 

Number n_factor; 

Number m; 

Multiples( Number n ) : 

is_forked_copy(false) 

{ 

m = Number(sqrt(double(n))); 

// Round up to even 

m += m&1; 

my_is_composite = new bool[m/2]; 

my_striker = new Number[m/2]; 

my_factor = new Number[m/2]; 

n_factor = 0; 

memset( my_is_composite, 0, m/2 ); 

for( Number i=3; i
if( !my_is_composite[i/2] ) { 

if( PrintPrimes ) 

printf("%dn",(int)i); 

my_striker[n_factor] = strike( i/2, m/2, i ); 

my_factor[n_factor++] = i; 

} 

} 

} 

//! Find primes in range [start,window_size), advancing my_striker as we go. 

/** Returns number of primes found. */ 

Number find_primes_in_window( Number start, Number window_size ) { 

bool* is_composite = my_is_composite; 

memset( is_composite, 0, window_size/2 ); 

for( size_t k=0; k
my_striker[k] = strike( my_striker[k]-m/2, window_size/2, my_factor[k] ); 

Number count = 0; 

for( Number k=0; k
if( !is_composite[k] ) { 

if( PrintPrimes ) 

printf("%ldn",long(start+2*k+1)); 

++count; 

} 

} 

return count; 

} 

~Multiples() { 

if( !is_forked_copy ) 

delete[] my_factor; 

delete[] my_striker; 

delete[] my_is_composite; 

} 

//------------------------------------------------------------------------ 

// Begin extra members required by parallel version 

//------------------------------------------------------------------------ 

//! True if this instance was forked from another instance. 

const bool is_forked_copy; 

Multiples( const Multiples& f, split ) : 

n_factor(f.n_factor), 

m(f.m), 

my_is_composite(NULL), 

my_striker(NULL), 

my_factor(f.my_factor), 

is_forked_copy(true) 

{} 

bool is_initialized() const { 

return my_is_composite!=NULL; 

} 

void initialize( Number start ) { 

assert( start>=1 ); 

my_is_composite = new bool[m/2]; 

my_striker = new Number[m/2]; 

for( size_t k=0; k
Number f = my_factor[k]; 

Number p = (start-1)/f*f % m; 

my_striker[k] = (p&1 ? p+2*f : p+f)/2; 

assert( m/2<=my_striker[k] ); 

} 

} 

//------------------------------------------------------------------------ 

// End extra methods required by parallel version 

//------------------------------------------------------------------------ 

}; 

//! Count number of primes between 0 and n 

/** This is the serial version. */ 

Number SerialCountPrimes( Number n ) { 

// Two is special case 

Number count = n>=2; 

if( n>=3 ) { 

Multiples multiples(n); 

count += multiples.n_factor; 

if( PrintPrimes ) 

printf("---n"); 

Number window_size = multiples.m; 

for( Number j=multiples.m; j>=n; j+=window_size ) { 

if( j+window_size>n+1 ) 

window_size = n+1-j; 

count += multiples.find_primes_in_window( j, window_size ); 

} 

} 

return count; 

} 

//! Range of a sieve window. 

class SieveRange { 

//! Width of full-size window into sieve. 

const Number my_stride; 

//! Always multiple of my_stride 

Number my_begin; 

//! One past last number in window. 

Number my_end; 

//! Width above which it is worth forking. 

const Number my_grainsize; 

bool assert_okay() const { 

assert( my_begin%my_stride==0 ); 

assert( my_begin<=my_end ); 

assert( my_stride<=my_grainsize ); 

return true; 

} 

public: 

//------------------------------------------------------------------------ 

// Begin signatures required by parallel_reduce 

//------------------------------------------------------------------------ 

bool is_divisible() const {return my_end-my_begin>my_grainsize;} 

bool empty() const {return my_end<=my_begin;} 

SieveRange( SieveRange& r, split ) : 

my_stride(r.my_stride), 

my_grainsize(r.my_grainsize), 

my_end(r.my_end) 

{ 

assert( r.is_divisible() ); 

assert( r.assert_okay() ); 

Number middle = r.my_begin + (r.my_end-r.my_begin+r.my_stride-1)/2; 

middle = middle/my_stride*my_stride; 

my_begin = middle; 

r.my_end = middle; 

assert( assert_okay() ); 

assert( r.assert_okay() ); 

} 

//------------------------------------------------------------------------ 

// End of signatures required by parallel_reduce 

//------------------------------------------------------------------------ 

Number begin() const {return my_begin;} 

Number end() const {return my_end;} 

SieveRange( Number begin, Number end, Number stride, Number grainsize ) : 

my_begin(begin), 

my_end(end), 

my_stride(stride), 

my_grainsize(grainsize
{ 

assert( assert_okay() ); 

} 

}; 

//! Loop body for parallel_reduce. 

/** parallel_reduce splits the sieve into subsieves. 

Each subsieve handles a subrange of [0..n]. */ 

class Sieve { 

public: 

//! Prime multiples to consider, and working storage for this subsieve. 

Multiples multiples; 

//! Number of primes found so far by this subsieve. 

Number count; 

//! Construct Sieve for counting primes in [0..n]. 

Sieve( Number n ) : 

multiples(n), 

count(0) 

{} 

//------------------------------------------------------------------------ 

// Begin signatures required by parallel_reduce 

//------------------------------------------------------------------------ 

void operator()( const SieveRange& r ) { 

Number m = multiples.m; 

if( multiples.is_initialized() ) { 

// Simply reuse "multiples" structure from previous window 

// This works because parallel_reduce always applies 

// *this from left to right. 

} else { 

// Need to initialize "multiples" because *this is a forked copy 

// that needs to be set up to start at r.begin(). 

multiples.initialize( r.begin() ); 

} 
Number window_size = m; 

for( Number j=r.begin(); j
assert( j%multiples.m==0 ); 

if( j+window_size>r.end() ) 
window_size = r.end()-j; 

count += multiples.find_primes_in_window( j, window_size ); 

} 

} 

void join( Sieve& other ) { 

count += other.count; 

} 

Sieve( Sieve& other, split ) : 

multiples(other.multiples,split()), 

count(0) 

{} 

//------------------------------------------------------------------------ 

// End of signatures required by parallel_reduce 

//------------------------------------------------------------------------ 

}; 

//! Count number of primes between 0 and n 

/** This is the parallel version. */ 

Number ParallelCountPrimes( Number n ) { 

// Two is special case 
Number count = n>=2; 

if( n>=3 ) { 

Sieve s(n); 

count += s.multiples.n_factor; 

if( PrintPrimes ) 

printf("---n"); 

parallel_reduce( SieveRange( s.multiples.m, n, s.multiples.m, GrainSize ), s ); 

count += s.count; 

} 

return count; 

} 

//------------------------------------------------------------------------ 

// Code below this line constitutes the driver that calls SerialCountPrimes 

// and ParallelCountPrimes. 

//------------------------------------------------------------------------ 

//! A closed range of Number. 

struct NumberRange { 

Number low; 

Number high; 

void set_from_string( const char* s ); 

NumberRange( Number low_, Number high_ ) : low(low_), high(high_) {} 

}; 

void NumberRange::set_from_string( const char* s ) { 

char* end; 

high = low = strtol(s,&end,0); 

switch( *end ) { 

case ':': 

high = strtol(end+1,0,0); 

break; 

case '0': 

break; 

default: 

printf("unexpected character = %cn",*end); 

} 


} 

//! Number of threads to use. 

static NumberRange NThread(0,4); 

//! If true, then at end wait for user to hit return 

static bool PauseFlag = false; 

//! Parse the command line. 

static Number ParseCommandLine( int argc, char* argv[] ) { 

Number n = 100000000; 

int i = 1; 
if( i
PauseFlag = true; 

++i; 

} 

if( i
// Command line is garbled. 

fprintf(stderr,"Usage: %s ]n", argv[0]); 

fprintf(stderr,"where n is a positive integer [%lu]n",n); 

fprintf(stderr," nthread is a non-negative integer, or range of the form low:high [%ld:%lu]n",NThread.low,NThread.high); 

fprintf(stderr," grainsize is an optional postive integer [%lu]n",GrainSize); 

exit(1); 

} 

if( i
n = strtol(argv[i++],0,0); 

if( i
NThread.set_from_string(argv[i++]); 

if( i
GrainSize = strtol(argv[i++],0,0); 

return n; 

} 

static void WaitForUser() { 

char c; 

printf("Press return to continue"); 

do { 

c = getchar(); 

} while( c!='n' ); 

< int main( int argc, char* argv[] ) { 

Number n = ParseCommandLine(argc,argv); 

// Try different numbers of threads 

for( Number p=NThread.low; p<=NThread.high; ++p ) { 

task_scheduler_init init(task_scheduler_init::deferred); 

// If p!=0, we are doing a parallel run 

if( p ) 

init.initialize(p); 

Number count; 

tick_count t0 = tick_count::now(); 

if( p==0 ) { 

count = SerialCountPrimes(n); 

} else { 

count = ParallelCountPrimes(n); 

} 

tick_count t1 = tick_count::now(); 

printf("#primes from [2..%lu] = %lu (%.2f sec with ", 

(unsigned long)n, (unsigned long)count, (t1-t0).seconds()); 

if( p ) 

printf("%lu-way parallelism)n", p ); 

else 

printf("serial code)n"); 

} 

if( PauseFlag ) { 

WaitForUser(); 

} 

return 0; 

} 

 

For more complete information about compiler optimizations, see our Optimization Notice.

Comments

todd-bezenek's picture

How is this different from Google's map-reduce?

-Todd

Todd Bezenek Computer Architect / Performance Analyst bezenek@gmail.com