auto_partitioner should be a good choice for the most common problems. But...
I've tried the following code:
class RenderByLines {
LPDWORD* const my_lines;
public:
void Perform(const int from, const int to) const {
LPDWORD* lines = my_lines;
for (int yy = from; yy != to; yy++) {
int y = 127 - (yy / 2);
if ((yy & 0x1) == 1) {
y = 255 - y;
}
LPDWORD line = lines[y];
for (int x = 0; x < 256; x++) {
int q = 1;
for (int z = 0; z < 256; z++) {
q = (q * x + z) * y;
}
line[x] = x + (256 * y) + (65536 * (q & 0xFF));
}
}
}
void operator()(const blocked_range& r) const {
Perform(r.begin(), r.end());
}
RenderByLines(LPDWORD* lines) :
my_lines(lines)
{}
};
void testPF() {
tick_count t0 = tick_count::now();
parallel_for(blocked_range(0, 256, 1), RenderByLines(microLines));
tick_count t1 = tick_count::now();
fprintf(debugFile, "test parallel_for: %d mksn", (int)((t1-t0).seconds() * 1000000));
}
void testPFA() {
tick_count t0 = tick_count::now();
parallel_for(blocked_range(0, 256), RenderByLines(microLines), auto_partitioner());
tick_count t1 = tick_count::now();
fprintf(debugFile, "test parallel_for_auto: %d mksn", (int)((t1-t0).seconds() * 1000000));
}
void testS() {
tick_count t0 = tick_count::now();
RenderByLines(microLines).Perform(0, 256);
tick_count t1 = tick_count::now();
fprintf(debugFile, "test sequential: %d mksn", (int)((t1-t0).seconds() * 1000000));
}
void PerformParallelRaytrace() {
for (int i = 0; i < 8; i++) {testPF();}
for (int i = 0; i < 8; i++) {testPFA();}
for (int i = 0; i < 8; i++) {testS();}
for (int i = 0; i < 8; i++) {testPF();}
for (int i = 0; i < 8; i++) {testPFA();}
for (int i = 0; i < 8; i++) {testS();}
}
And got results:
test parallel_for: 102283 mks test parallel_for: 74223 mks test parallel_for: 74463 mks test parallel_for: 78521 mks test parallel_for: 80608 mks test parallel_for: 76435 mks test parallel_for: 78746 mks test parallel_for: 76028 mks test parallel_for_auto: 77084 mks test parallel_for_auto: 79055 mks test parallel_for_auto: 69775 mks test parallel_for_auto: 74822 mks test parallel_for_auto: 80514 mks test parallel_for_auto: 76490 mks test parallel_for_auto: 92116 mks test parallel_for_auto: 92822 mks test sequential: 122995 mks test sequential: 128488 mks test sequential: 120866 mks test sequential: 121859 mks test sequential: 120693 mks test sequential: 120593 mks test sequential: 120668 mks test sequential: 120733 mks test parallel_for: 71991 mks test parallel_for: 73140 mks test parallel_for: 73169 mks test parallel_for: 71933 mks test parallel_for: 73696 mks test parallel_for: 73777 mks test parallel_for: 73173 mks test parallel_for: 77050 mks test parallel_for_auto: 77619 mks test parallel_for_auto: 74510 mks test parallel_for_auto: 76760 mks test parallel_for_auto: 73786 mks test parallel_for_auto: 74553 mks test parallel_for_auto: 73108 mks test parallel_for_auto: 73642 mks test parallel_for_auto: 73634 mks test sequential: 120947 mks test sequential: 120867 mks test sequential: 122299 mks test sequential: 120984 mks test sequential: 121005 mks test sequential: 120843 mks test sequential: 120354 mks test sequential: 120945 mks
Seems to be very strange. When there were no internal cycle (calculating q) - auto_partitioner worked good - is was about 15% faster. But on long lasting computations - auto_partitioner consumes about 2 additional milliseconds!
I can't understand this result.



