Hi All,
I'm evaluating the performance (this time not MKL6 vs MKL11) of MKL11 with 1 thread versus 4 threads.
The 4-thread version seems to be slower. Furthermore, the 4-thread implementation has a huge number of outliers. Does anyone have an explanation why?
Below is the source (the float and double versions are similar); I shortened it for a better overview.
Main function:
// Benchmark entry point: sets the MKL thread count, raises the process
// priority class while the test runs, and restores it afterwards.
// argc/argv are not used in the code shown.
int _tmain(int argc, _TCHAR* argv[])
{
int threads = 4; // thread count under test: 4 or 1
// Configure the MKL thread count, then raise the process priority class
// before the test body executes.
mkl_set_num_threads(threads);SetPriorityClass(GetCurrentProcess(), HIGH_PRIORITY_CLASS ); // Set the process priority to 'High'
// Placeholder left by the author: the test function body is inserted here.
TEST FUNCTION HERE
SetPriorityClass( GetCurrentProcess(), NORMAL_PRIORITY_CLASS ); // Restore the process priority to 'Normal'
}
TEST FUNCTION
// Body of the test function (its header and most declarations were elided by
// the author). For every exponent exp in [exp_start, exp_stop] it builds a
// 1-D real DFTI descriptor of length Nfft = 2^exp, calls DftiComputeForward
// `loops` times, timing each call individually, and reduces the per-call
// timings to min / max / average. The procedure runs once with
// single-precision buffers (cxf*) and once with double-precision buffers (cxd*).
// NOTE(review): every DFTI return value is stored in `status` but never
// checked in the code shown — confirm whether error handling was elided.
DFTI_DESCRIPTOR_HANDLE hand;
// One timing slot per timed call; this buffer is reused by both passes.
cxdTimeLoops.alloc(loops);// FLOAT
// k is incremented once per FFT size; it is not read in the code shown
// (presumably it indexes a result array that was elided — TODO confirm).
k=0;
for (exp=exp_start;exp<=exp_stop;exp++)
{
// FFT length for this iteration: 2^exp, truncated to unsigned int.
Nfft = (unsigned int) pow(2.0,exp);
// NOTE(review): unlike the DOUBLE pass below, no alloc() calls for
// cxfTimesig / cxfTimeaxis / cxfFreqsig are visible here — presumably
// elided; verify the buffers hold at least Nfft elements.
myRndNumber = 1; //seed
for (i=0;i<Nfft;i++) //get pseudo random signal
{
// NextRand32 is defined elsewhere; presumably one step of a 32-bit PRNG.
myRndNumber = NextRand32(myRndNumber);
// Scale the random value by UINT_MAX, then map it via *2-1
// (yields [-1, 1] assuming myRndNumber is an unsigned 32-bit value).
cxfTimesig[i] = ((float) myRndNumber / UINT_MAX)*2-1;
// Time axis value (i+1)/fs; not used by the FFT calls shown here.
cxfTimeaxis[i] = ((float) i + 1.0) / fs;
}
hand = 0;
// Single-precision, real-input, 1-D descriptor of length Nfft, configured
// for out-of-place operation and then committed.
status = DftiCreateDescriptor(&hand, DFTI_SINGLE, DFTI_REAL, 1, Nfft);
status = DftiSetValue(hand, DFTI_PLACEMENT, DFTI_NOT_INPLACE);
status = DftiCommitDescriptor(hand);
// Time each forward transform separately.
// NOTE(review): the first call after commit is timed like all others; if it
// carries one-time setup cost it will show up as an outlier — verify.
for (i=0;i<loops;i++)
{
hpfcTimer.Start(); //start timer for single execution
status = DftiComputeForward(hand, cxfTimesig.ptr(), cxfFreqsig.ptr());
// Elapsed time as reported by the timer (units not visible here).
cxdTimeLoops[i] = hpfcTimer.Time();
}
DftiFreeDescriptor(&hand);
// Reduce the per-call timings for this Nfft to max / min / average.
dTimeMax = 0;
dTimeMin = cxdTimeLoops[0];
dTimeAvg = 0;
for (i=0;i<loops;i++)
{
dTimeAvg += cxdTimeLoops[i];
dTimeMax = max(cxdTimeLoops[i],dTimeMax);
dTimeMin = min(cxdTimeLoops[i],dTimeMin);
}
dTimeAvg /= (double) loops;
// The statistics are not stored or printed in the code shown
// (presumably elided — TODO confirm).
k++;
}// DOUBLE
// Second pass: same procedure with double-precision buffers and descriptor.
k=0;
for (exp=exp_start;exp<=exp_stop;exp++)
{
// FFT length for this iteration: 2^exp, truncated to unsigned int.
Nfft = (unsigned int) pow(2.0,exp);
// Allocate the double-precision buffers with Nfft elements each.
cxdFreqsig.alloc(Nfft);
cxdTimesig.alloc(Nfft);
cxdTimeaxis.alloc(Nfft);
myRndNumber = 1; //seed
for (i=0;i<Nfft;i++) //get pseudo random signal
{
myRndNumber = NextRand32(myRndNumber);
// Same scaling as in the FLOAT pass, performed in double precision.
cxdTimesig[i] = ((double) myRndNumber / UINT_MAX)*2-1;
// Time axis value (i+1)/fs; not used by the FFT calls shown here.
cxdTimeaxis[i] = ((double) i + 1.0) / fs;
}hand = 0;
// Double-precision, real-input, 1-D descriptor of length Nfft, configured
// for out-of-place operation and then committed.
status = DftiCreateDescriptor(&hand, DFTI_DOUBLE, DFTI_REAL, 1, Nfft);
status = DftiSetValue(hand, DFTI_PLACEMENT, DFTI_NOT_INPLACE);
status = DftiCommitDescriptor(hand);
// Time each forward transform separately (the first call is included).
for (i=0;i<loops;i++)
{
hpfcTimer.Start(); //start timer for single execution
status = DftiComputeForward(hand, cxdTimesig.ptr(), cxdFreqsig.ptr());
// Elapsed time as reported by the timer (units not visible here).
cxdTimeLoops[i] = hpfcTimer.Time();
}
DftiFreeDescriptor(&hand);
// Reduce the per-call timings for this Nfft to max / min / average.
dTimeMax = 0;
dTimeMin = cxdTimeLoops[0];
dTimeAvg = 0;
for (i=0;i<loops;i++)
{
dTimeAvg += cxdTimeLoops[i];
dTimeMax = max(cxdTimeLoops[i],dTimeMax);
dTimeMin = min(cxdTimeLoops[i],dTimeMin);
}
dTimeAvg /= (double) loops;
k++;
}
// Closes the enclosing test function, whose header is not shown in the post.
}
dTimeAvg is plotted versus Nfft for float and double. I'm attaching the individual plots with min/max to visualize the outliers.
Thanks, Marian





