Problems with Intel HD Graphics 4000 video card

Problems with Intel HD Graphics 4000 video card

Ritratto di Vanessa O.

I created an OpenCL program to execute some mathematical operations. It worked fine on Nvidia and AMD video cards (the results matched WolframAlpha, for instance), but when I tried it on a notebook with an Intel HD Graphics 4000 video card, which supports OpenCL, my program returned unexpected results for some functions.
Those functions were erf, erfc, tgamma and lgamma. I thought my video card had a problem, but I tried 3 other machines with Intel HD Graphics 4000 and obtained the same results as on the first one.

Some examples:

Intel HD Graphics 4000
erf( 0.25 ) = 0.0262698
erfc( 0.25 ) = 0.97373
lgamma( 5.5 ) = 1.70475

Other video cards:
erf( 0.25 ) = 0.276326
erfc( 0.25 ) = 0.723674
lgamma( 5.5 ) = 3.95781

I tried it with the following configurations:
OpenCL version: 1.1
OpenCL driver version: 8.15.10.2778

and
OpenCL version: 1.1
OpenCL driver version: 9.17.10.2932

Does anyone know the reason behind the different results?

5 post / 0 new
Ultimo contenuto
Per informazioni complete sulle ottimizzazioni del compilatore, consultare l'Avviso sull'ottimizzazione
Ritratto di Raghu Muthyalampalli (Intel)

Hi,

Do you have a reproducer? I can probably try to create one but it'll be easier and faster if you already have a reproducer.

Thanks,
Raghu

 

Ritratto di Vanessa O.

Hi,

Here is a simple program with a kernel that calculates the values of the functions I mentioned before, just for testing purposes. It only uses the first device it finds, so the Intel HD 4000 must be the only GPU in the computer for this program to show the issue.

#include <cstdlib>
#include <fstream>
#include <iostream>
#include <string.h>
#include <vector>
using namespace std;
#include <CL/cl.h>
// Number of float elements in each output buffer; main() also uses it as the
// kernel's `count` argument and as the global work size.
#define DATA_SIZE (1*1*1)
// OpenCL C source of the "math" kernel, compiled at run time by main().
// Each work-item whose global id i is below `count` stores
//   output0[i] = erf(input0),    output1[i] = erfc(input0),
//   output2[i] = tgamma(input1), output3[i] = lgamma(input1).
// The two inputs are scalars passed by value, so every work-item computes the
// same four results.
const char *KernelSource = " \n" 
 "__kernel void math( \n" 
 " const float input0, \n" 
 " const float input1, \n" 
 " __global float* output0, \n" 
 " __global float* output1, \n" 
 " __global float* output2, \n" 
 " __global float* output3, \n" 
 " const unsigned int count) \n" 
 "{ \n" 
 " int i = get_global_id(0); \n" 
 " if(i < count){ \n" 
 " output0[i] = erf(input0); \n" 
 " output1[i] = erfc(input0); \n" 
 " output2[i] = tgamma(input1); \n" 
 " output3[i] = lgamma(input1); \n" 
 "}} \n" 
 "\n";
int main(int argc, char* argv[])
{
 int devType = CL_DEVICE_TYPE_GPU;
cl_int err; // error code returned from api calls
size_t global; // global domain size for our calculation
 size_t local;
cl_platform_id cpPlatform; // OpenCL platform
 cl_device_id device_id; // compute device id
 cl_context context; // compute context
 cl_command_queue commands; // compute command queue
 cl_program program; // compute program
 cl_kernel kernel; // compute kernel
err = clGetPlatformIDs(1, &cpPlatform, NULL);
 err = clGetDeviceIDs(cpPlatform, devType, 1, &device_id, NULL);
 context = clCreateContext(0, 1, &device_id, NULL, NULL, &err);
 commands = clCreateCommandQueue(context, device_id, 0, &err);
 program = clCreateProgramWithSource(context, 1,
 (const char **) &KernelSource,
 NULL, &err);
 err = clBuildProgram(program, 1, &device_id, NULL, NULL, NULL);
 kernel = clCreateKernel(program, "math", &err);
cl_float defined_data0[5] = {0.000000000000f, 0.250000000000f, 0.750000000000f, 1.200000000000f, 2.100000000000f};
 cl_float defined_data1[5] = {5.500000000000f, 7.400000000000f, 1.230000000000f, 2.780000000000f, 6.890000000000f};
 cl_float expected_data0[5] = { 0.0000000000000000000000000f,
 0.2763263901682369295107790f,
 0.7111556336535151327631066f,
 0.9103139782296353688714058f,
 0.9970205333436670158317981f,};
 cl_float expected_data1[5] = { 1.0000000000000000000000000f,
 0.7236736098317630704892210f,
 0.2888443663464848672368934f,
 0.0896860090223934799878963f,
 0.0029794679643595611119777f,
 };
cl_float expected_data2[5] = { 52.3427777845535207174032521f,
 1541.3364759429129485779341735f,
 0.9107548520004719437180227f,
 1.6486845657792377944960219f,
 586.5036549713060703425959730f,
 };
 cl_float expected_data3[5] = { 3.9578139676187164873802371f,
 7.3404051605723232942594947f,
 -0.0934815156297828550078870f,
 0.4999777370923010424194188f,
 6.3741788999092460674643357f,
 };
for(int iterator=0; iterator<5; iterator++){
cl_mem output0 = clCreateBuffer(context, CL_MEM_WRITE_ONLY, DATA_SIZE*sizeof(float), NULL, &err);
 cl_mem output1 = clCreateBuffer(context, CL_MEM_WRITE_ONLY, DATA_SIZE*sizeof(float), NULL, &err);
 cl_mem output2 = clCreateBuffer(context, CL_MEM_WRITE_ONLY, DATA_SIZE*sizeof(float), NULL, &err);
 cl_mem output3 = clCreateBuffer(context, CL_MEM_WRITE_ONLY, DATA_SIZE*sizeof(float), NULL, &err);
 unsigned int count = DATA_SIZE;
err=0;
 err = clSetKernelArg(kernel, 0, sizeof(cl_float), &defined_data0[iterator]);
 err = clSetKernelArg(kernel, 1, sizeof(cl_float), &defined_data1[iterator]);
err |= clSetKernelArg(kernel, 2, sizeof(cl_mem), &output0);
 err |= clSetKernelArg(kernel, 3, sizeof(cl_mem), &output1);
 err |= clSetKernelArg(kernel, 4, sizeof(cl_mem), &output2);
 err |= clSetKernelArg(kernel, 5, sizeof(cl_mem), &output3);
err |= clSetKernelArg(kernel, 6, sizeof(unsigned int), &count);
global = DATA_SIZE;
 local = 1;
err=0;
 err = clEnqueueNDRangeKernel(commands, kernel, 1, NULL, &global, &local, 0, NULL, NULL);
clFinish(commands);
cl_float* results0 = (cl_float *)clEnqueueMapBuffer(commands, output0, CL_TRUE, CL_MAP_READ, 0, sizeof(float)*DATA_SIZE, NULL, NULL, NULL, &err);
 cl_float* results1 = (cl_float *)clEnqueueMapBuffer(commands, output1, CL_TRUE, CL_MAP_READ, 0, sizeof(float)*DATA_SIZE, NULL, NULL, NULL, &err);
 cl_float* results2 = (cl_float *)clEnqueueMapBuffer(commands, output2, CL_TRUE, CL_MAP_READ, 0, sizeof(float)*DATA_SIZE, NULL, NULL, NULL, &err);
 cl_float* results3 = (cl_float *)clEnqueueMapBuffer(commands, output3, CL_TRUE, CL_MAP_READ, 0, sizeof(float)*DATA_SIZE, NULL, NULL, NULL, &err);
cout.precision(10);
 cout << fixed << "Expected result:\t" << "erf( " << defined_data0[iterator] << " ) = " << expected_data0[iterator] << endl;
 cout << fixed << "Calculated result:\t" << "erf( " << defined_data0[iterator] << " ) = " << results0[0] << endl;
 cout << endl;
 cout << fixed << "Expected result:\t" << "erfc( " << defined_data0[iterator] << " ) = " << expected_data1[iterator] << endl;
 cout << fixed << "Calculated result:\t" << "erfc( " << defined_data0[iterator] << " ) = " << results1[0] << endl;
 cout << endl;
 cout << fixed << "Expected result:\t" << "tgamma( " << defined_data1[iterator] << " ) = " << expected_data2[iterator] << endl;
 cout << fixed << "Calculated result:\t" << "tgamma( " << defined_data1[iterator] << " ) = " << results2[0] << endl;
 cout << endl;
 cout << fixed << "Expected result:\t" << "lgamma( " << defined_data1[iterator] << " ) = " << expected_data3[iterator] << endl;
 cout << fixed << "Calculated result:\t" << "lgamma( " << defined_data1[iterator] << " ) = " << results3[0] << endl;
 cout << endl;
 cout << "-------------------------------" << endl;
 cout << endl;
}
system("PAUSE");
return 0;
}

Expected results were calculated on CPU with c++ math.h functions, and compared with wolfram alpha to make sure they were right.

Here are some screenshots I took from this program execution:

On Intel HD 4000

On AMD Radeon HD 6570 and NVIDIA Quadro 600

And TY in advance.

Ritratto di Ben Ashbaugh (Intel)

Hi Vanessa,

I believe I have reproduced all of these issues.  A few of them were already fixed.  If you aren't using the latest driver, you may try upgrading to see if that helps.  The others are fixed internally, but it may be some time before a driver is released with the fixes.

Thanks!

  -- Ben

Ritratto di Vanessa O.

Hi Ben,

Thanks for the help. I tried driver version 15.31.17.3257, from 09/11/2013, and some of the values were fixed, but some weren't.
Is there an estimated time for the corrections to be released?

(A screenshot of the same program running on the latest driver) 

 

Thanks for help! :)

Accedere per lasciare un commento.