Error compiling kernel for GPU

Error compiling kernel for GPU

peastman的头像

I'm testing my code on an Ivy Bridge processor under Windows 7. When I tell it to use the CPU device, it works correctly. When I try to run on the GPU device, clCreateProgramWithSource() fails with the following error:

igdfcl32.dll successfully completed build.
error: Cannot yet select: 0x2fe3f10: i8 = any_extend 0x3d24138 [ID=66]
0x3d24138: i32 = and 0x3d240b0, 0x3e5ac30 [ID=63]
0x3d240b0: i32 = xor 0x2fe31c8, 0x3d233f0 [ID=61]
0x2fe31c8: i32 = GHAL3DISD::GHAL3DSETCC 0x3e5c078, 0x3e5ab20, 0x3d23f18 [ID=59]
0x3e5c078: f32,ch = llvm.GHAL3D.fabsf 0x2fe2f48, 0x3e5b0f8, 0x3d23830 [ORD=123] [ID=57]
0x2fe2f48: ch = EntryToken [ORD=88] [ID=0]
0x3e5b0f8: i32 = Constant<50> [ID=18]
0x3d23830: f32 = fdiv 0x3e5acb8, 0x3e5c430 [ORD=103] [ID=55]
0x3e5acb8: f32 = fmul 0x3e5aa98, 0x2fe3690 [ORD=102] [ID=54]
0x3e5aa98: f32 = fmul 0x3e5b4c8, 0x3d23258 [ORD=101] [ID=37]
0x3e5b4c8: f32,ch = CopyFromReg 0x2fe2f48, 0x2fe3be0 [ORD=101] [ID=25]
0x2fe2f48: ch = EntryToken [ORD=88] [ID=0]
0x2fe3be0: f32 = Register %reg1038 [ORD=101] [ID=6]
0x3d23258: f32 = ConstantFP<-2.000000e+000> [ORD=101] [ID=7]
0x2fe3690: f32 = fadd 0x3d23a50, 0x2fe3cf0 [ORD=100] [ID=53]
0x3d23a50: f32 = fadd 0x3d23500, 0x2fe3608 [ORD=96] [ID=52]
0x3d23500: f32 = fmul 0x2fe3e00, 0x3e5c100 [ORD=92] [ID=47]
0x2fe3e00: f32 = GHAL3DISD::EXTRACTX 0x3d5ab58 [ID=40]
0x3d5ab58: v4f32 = fsub 0x3e5ae50, 0x3d5aad0 [ORD=89] [ID=33]
0x3e5ae50: v4f32,ch = CopyFromReg 0x2fe2f48, 0x3d5a828 [ORD=89] [ID=22]
0x2fe2f48: ch = EntryToken [ORD=88] [ID=0]
0x3d5a828: v4f32 = Register %reg1057 [ORD=89] [ID=3]
0x3d5aad0: v4f32,ch = CopyFromReg 0x2fe2f48, 0x3e5a878 [ORD=89] [ID=23]
0x2fe2f48: ch = EntryToken [ORD=88] [ID=0]
0x3e5a878: v4f32 = Register %reg1056 [ORD=89] [ID=4]
0x3e5c100: f32 = GHAL3DISD::EXTRACTX 0x3d5abe0 [ID=34]
0x3d5abe0: v4f32,ch = CopyFromReg 0x2fe2f48, 0x2fe3580 [ORD=91] [ID=24]
0x2fe2f48: ch = EntryToken [ORD=88] [ID=0]
0x2fe3580: v4f32 = Register %reg1048 [ORD=91] [ID=5]
0x2fe3608: f32 = fmul 0x3d232e0, 0x2fe3938 [ORD=95] [ID=48]
0x3d232e0: f32 = GHAL3DISD::EXTRACTY 0x3d5ab58 [ID=41]
0x3d5ab58: v4f32 = fsub 0x3e5ae50, 0x3d5aad0 [ORD=89] [ID=33]
0x3e5ae50: v4f32,ch = CopyFromReg 0x2fe2f48, 0x3d5a828 [ORD=89] [ID=22]
0x2fe2f48: ch = EntryToken [ORD=88] [ID=0]
0x3d5a828: v4f32 = Register %reg1057 [ORD=89] [ID=3]
0x3d5aad0: v4f32,ch = CopyFromReg 0x2fe2f48, 0x3e5a878 [ORD=89] [ID=23]
0x2fe2f48: ch = EntryToken [ORD=88] [ID=0]
0x3e5a878: v4f32 = Register %reg1056 [ORD=89] [ID=4]
0x2fe3938: f32 = GHAL3DISD::EXTRACTY 0x3d5abe0 [ID=35]
0x3d5abe0: v4f32,ch = CopyFromReg 0x2fe2f48, 0x2fe3580 [ORD=91] [ID=24]
0x2fe2f48: ch = EntryToken [ORD=88] [ID=0]
0x2fe3580: v4f32 = Register %reg1048 [ORD=91] [ID=5]
0x2fe3cf0: f32 = fmul 0x3d23610, 0x2fe37a0 [ORD=99] [ID=49]
0x3d23610: f32 = GHAL3DISD::EXTRACTZ 0x3d5ab58 [ID=42]
0x3d5ab58: v4f32 = fsub 0x3e5ae50, 0x3d5aad0 [ORD=89] [ID=33]
0x3e5ae50: v4f32,ch = CopyFromReg 0x2fe2f48, 0x3d5a828 [ORD=89] [ID=22]
0x2fe2f48: ch = EntryToken [ORD=88] [ID=0]
0x3d5a828: v4f32 = Register %reg1057 [ORD=89] [ID=3]
0x3d5aad0: v4f32,ch = CopyFromReg 0x2fe2f48, 0x3e5a878 [ORD=89] [ID=23]
0x2fe2f48: ch = EntryToken [ORD=88] [ID=0]
0x3e5a878: v4f32 = Register %reg1056 [ORD=89] [ID=4]
0x2fe37a0: f32 = GHAL3DISD::EXTRACTZ 0x3d5abe0 [ID=36]
0x3d5abe0: v4f32,ch = CopyFromReg 0x2fe2f48, 0x2fe3580 [ORD=91] [ID=24]
0x2fe2f48: ch = EntryToken [ORD=88] [ID=0]
0x2fe3580: v4f32 = Register %reg1048 [ORD=91] [ID=5]
0x3e5c430: f32,ch = CopyFromReg 0x2fe2f48, 0x3e5a6e0 [ORD=103] [ID=26]
0x2fe2f48: ch = EntryToken [ORD=88] [ID=0]
0x3e5a6e0: f32 = Register %reg1051 [ORD=103] [ID=8]
0x3e5ab20: f32,ch = CopyFromReg 0x2fe2f48, 0x3e5a988 [ORD=124] [ID=29]
0x2fe2f48: ch = EntryToken [ORD=88] [ID=0]
0x3e5a988: f32 = Register %reg1025 [ORD=124] [ID=11]
0x3d23f18: ch = setgt [ID=20]
0x3d233f0: i32 = Constant<-1> [ORD=88] [ID=2]
0x3e5ac30: i32 = Constant<1> [ID=17]
error: midlevel compiler failed build.

Any idea what this means?

Peter

6 帖子 / 0 new
最新文章
如需更全面地了解编译器优化,请参阅优化注意事项.
Brijender Bharti的头像

Its hard to know from the error log. If you can share the code that will be great. Or I will suggest start deleting the code from kernel to pin point to a small set of lines which causes this issue.

peastman的头像

Here is the source code for the kernel. This is part of OpenMM (https://simtk.org/home/openmm):

#define EXP native_exp #define LOG native_log #define RECIP native_recip #define RSQRT native_rsqrt #define SQRT native_sqrt #define WORK_GROUP_SIZE 64 #define CONSTRAIN_VELOCITIES 1 /** * Enforce constraints on SHAKE clusters */ __kernel void applyShakeToHydrogens(int numClusters, float tol, __global const float4* restrict oldPos, __global float4* restrict posDelta, __global const int4* restrict clusterAtoms, __global const float4* restrict clusterParams) { int index = get_global_id(0); while (index < numClusters) { // Load the data for this cluster. int4 atoms = clusterAtoms[index]; float4 params = clusterParams[index]; float4 pos = oldPos[atoms.x]; float4 xpi = posDelta[atoms.x]; float4 pos1 = oldPos[atoms.y]; float4 xpj1 = posDelta[atoms.y]; float4 pos2 = {0.0f, 0.0f, 0.0f, 0.0f}; float4 xpj2 = {0.0f, 0.0f, 0.0f, 0.0f}; float invMassCentral = params.x; float avgMass = params.y; float d2 = params.z; float invMassPeripheral = params.w; if (atoms.z != -1) { pos2 = oldPos[atoms.z]; xpj2 = posDelta[atoms.z]; } float4 pos3 = {0.0f, 0.0f, 0.0f, 0.0f}; float4 xpj3 = {0.0f, 0.0f, 0.0f, 0.0f}; if (atoms.w != -1) { pos3 = oldPos[atoms.w]; xpj3 = posDelta[atoms.w]; } // Precompute quantities. float4 rij1 = pos-pos1; float4 rij2 = pos-pos2; float4 rij3 = pos-pos3; float rij1sq = rij1.x*rij1.x + rij1.y*rij1.y + rij1.z*rij1.z; float rij2sq = rij2.x*rij2.x + rij2.y*rij2.y + rij2.z*rij2.z; float rij3sq = rij3.x*rij3.x + rij3.y*rij3.y + rij3.z*rij3.z; float ld1 = d2-rij1sq; float ld2 = d2-rij2sq; float ld3 = d2-rij3sq; // Iterate until convergence. bool converged = false; int iteration = 0; while (iteration < 15 && !converged) { converged = true; #ifdef CONSTRAIN_VELOCITIES float4 rpij = xpi-xpj1; float rrpr = rpij.x*rij1.x + rpij.y*rij1.y + rpij.z*rij1.z; float delta = -2.0f*avgMass*rrpr/rij1sq; float4 dr = rij1*delta; xpi.xyz += dr.xyz*invMassCentral; xpj1.xyz -= dr.xyz*invMassPeripheral; if (fabs(delta) > tol) converged = false; if (atoms.z != -1) { rpij = xpi-xpj2; rrpr = rpij.x*rij2.x + rpij.y*rij2.y + rpij.z*rij2.z; delta = -2.0f*avgMass*rrpr/rij2sq; dr = rij2*delta; xpi.xyz += dr.xyz*invMassCentral; xpj2.xyz -= dr.xyz*invMassPeripheral; if (fabs(delta) > tol) converged = false; } if (atoms.w != -1) { rpij = xpi-xpj3; rrpr = rpij.x*rij3.x + rpij.y*rij3.y + rpij.z*rij3.z; delta = -2.0f*avgMass*rrpr/rij3sq; dr = rij3*delta; xpi.xyz += dr.xyz*invMassCentral; xpj3.xyz -= dr.xyz*invMassPeripheral; if (fabs(delta) > tol) converged = false; } #else float4 rpij = xpi-xpj1; float rpsqij = rpij.x*rpij.x + rpij.y*rpij.y + rpij.z*rpij.z; float rrpr = rij1.x*rpij.x + rij1.y*rpij.y + rij1.z*rpij.z; float diff = fabs(ld1-2.0f*rrpr-rpsqij) / (d2*tol); if (diff >= 1.0f) { float acor = (ld1-2.0f*rrpr-rpsqij)*avgMass / (rrpr+rij1sq); float4 dr = rij1*acor; xpi.xyz += dr.xyz*invMassCentral; xpj1.xyz -= dr.xyz*invMassPeripheral; converged = false; } if (atoms.z != -1) { rpij.xyz = xpi.xyz-xpj2.xyz; rpsqij = rpij.x*rpij.x + rpij.y*rpij.y + rpij.z*rpij.z; rrpr = rij2.x*rpij.x + rij2.y*rpij.y + rij2.z*rpij.z; diff = fabs(ld2-2.0f*rrpr-rpsqij) / (d2*tol); if (diff >= 1.0f) { float acor = (ld2 - 2.0f*rrpr - rpsqij)*avgMass / (rrpr + rij2sq); float4 dr = rij2*acor; xpi.xyz += dr.xyz*invMassCentral; xpj2.xyz -= dr.xyz*invMassPeripheral; converged = false; } } if (atoms.w != -1) { rpij.xyz = xpi.xyz-xpj3.xyz; rpsqij = rpij.x*rpij.x + rpij.y*rpij.y + rpij.z*rpij.z; rrpr = rij3.x*rpij.x + rij3.y*rpij.y + rij3.z*rpij.z; diff = fabs(ld3 - 2.0f*rrpr - rpsqij) / (d2*tol); if (diff >= 1.0f) { float acor = (ld3-2.0f*rrpr-rpsqij)*avgMass / (rrpr+rij3sq); float4 dr = rij3*acor; xpi.xyz += dr.xyz*invMassCentral; xpj3.xyz -= dr.xyz*invMassPeripheral; converged = false; } } #endif iteration++; } // Record the new positions. posDelta[atoms.x] = xpi; posDelta[atoms.y] = xpj1; if (atoms.z != -1) posDelta[atoms.z] = xpj2; if (atoms.w != -1) posDelta[atoms.w] = xpj3; index += get_global_size(0); } }

Peter

Raghu Muthyalampalli (Intel)的头像

Hi Peter,

Thanks for providing the kernel. I am able to reproduce the build error on the GPU using the 2012SDK. I'll get back to you on my findings after debug it more.

Thanks,
Raghu

Raghu Muthyalampalli (Intel)的头像

I was able to narrow down the test case to a few lines causing the build to fail. I have filed a bug and will let you know if development suggests a fix or a workaround.

Thanks,
Raghu

peastman的头像

Great, thank you for the update.

Peter

登陆并发表评论。