Error compiling kernel for GPU

Error compiling kernel for GPU

Portrait de peastman

I'm testing my code on an Ivy Bridge processor under Windows 7. When I tell it to use the CPU device, it works correctly. When I try to run on the GPU device, clCreateProgramWithSource() fails with the following error:

igdfcl32.dll successfully completed build.
error: Cannot yet select: 0x2fe3f10: i8 = any_extend 0x3d24138 [ID=66]
0x3d24138: i32 = and 0x3d240b0, 0x3e5ac30 [ID=63]
0x3d240b0: i32 = xor 0x2fe31c8, 0x3d233f0 [ID=61]
0x2fe31c8: i32 = GHAL3DISD::GHAL3DSETCC 0x3e5c078, 0x3e5ab20, 0x3d23f18 [ID=59]
0x3e5c078: f32,ch = llvm.GHAL3D.fabsf 0x2fe2f48, 0x3e5b0f8, 0x3d23830 [ORD=123] [ID=57]
0x2fe2f48: ch = EntryToken [ORD=88] [ID=0]
0x3e5b0f8: i32 = Constant<50> [ID=18]
0x3d23830: f32 = fdiv 0x3e5acb8, 0x3e5c430 [ORD=103] [ID=55]
0x3e5acb8: f32 = fmul 0x3e5aa98, 0x2fe3690 [ORD=102] [ID=54]
0x3e5aa98: f32 = fmul 0x3e5b4c8, 0x3d23258 [ORD=101] [ID=37]
0x3e5b4c8: f32,ch = CopyFromReg 0x2fe2f48, 0x2fe3be0 [ORD=101] [ID=25]
0x2fe2f48: ch = EntryToken [ORD=88] [ID=0]
0x2fe3be0: f32 = Register %reg1038 [ORD=101] [ID=6]
0x3d23258: f32 = ConstantFP<-2.000000e+000> [ORD=101] [ID=7]
0x2fe3690: f32 = fadd 0x3d23a50, 0x2fe3cf0 [ORD=100] [ID=53]
0x3d23a50: f32 = fadd 0x3d23500, 0x2fe3608 [ORD=96] [ID=52]
0x3d23500: f32 = fmul 0x2fe3e00, 0x3e5c100 [ORD=92] [ID=47]
0x2fe3e00: f32 = GHAL3DISD::EXTRACTX 0x3d5ab58 [ID=40]
0x3d5ab58: v4f32 = fsub 0x3e5ae50, 0x3d5aad0 [ORD=89] [ID=33]
0x3e5ae50: v4f32,ch = CopyFromReg 0x2fe2f48, 0x3d5a828 [ORD=89] [ID=22]
0x2fe2f48: ch = EntryToken [ORD=88] [ID=0]
0x3d5a828: v4f32 = Register %reg1057 [ORD=89] [ID=3]
0x3d5aad0: v4f32,ch = CopyFromReg 0x2fe2f48, 0x3e5a878 [ORD=89] [ID=23]
0x2fe2f48: ch = EntryToken [ORD=88] [ID=0]
0x3e5a878: v4f32 = Register %reg1056 [ORD=89] [ID=4]
0x3e5c100: f32 = GHAL3DISD::EXTRACTX 0x3d5abe0 [ID=34]
0x3d5abe0: v4f32,ch = CopyFromReg 0x2fe2f48, 0x2fe3580 [ORD=91] [ID=24]
0x2fe2f48: ch = EntryToken [ORD=88] [ID=0]
0x2fe3580: v4f32 = Register %reg1048 [ORD=91] [ID=5]
0x2fe3608: f32 = fmul 0x3d232e0, 0x2fe3938 [ORD=95] [ID=48]
0x3d232e0: f32 = GHAL3DISD::EXTRACTY 0x3d5ab58 [ID=41]
0x3d5ab58: v4f32 = fsub 0x3e5ae50, 0x3d5aad0 [ORD=89] [ID=33]
0x3e5ae50: v4f32,ch = CopyFromReg 0x2fe2f48, 0x3d5a828 [ORD=89] [ID=22]
0x2fe2f48: ch = EntryToken [ORD=88] [ID=0]
0x3d5a828: v4f32 = Register %reg1057 [ORD=89] [ID=3]
0x3d5aad0: v4f32,ch = CopyFromReg 0x2fe2f48, 0x3e5a878 [ORD=89] [ID=23]
0x2fe2f48: ch = EntryToken [ORD=88] [ID=0]
0x3e5a878: v4f32 = Register %reg1056 [ORD=89] [ID=4]
0x2fe3938: f32 = GHAL3DISD::EXTRACTY 0x3d5abe0 [ID=35]
0x3d5abe0: v4f32,ch = CopyFromReg 0x2fe2f48, 0x2fe3580 [ORD=91] [ID=24]
0x2fe2f48: ch = EntryToken [ORD=88] [ID=0]
0x2fe3580: v4f32 = Register %reg1048 [ORD=91] [ID=5]
0x2fe3cf0: f32 = fmul 0x3d23610, 0x2fe37a0 [ORD=99] [ID=49]
0x3d23610: f32 = GHAL3DISD::EXTRACTZ 0x3d5ab58 [ID=42]
0x3d5ab58: v4f32 = fsub 0x3e5ae50, 0x3d5aad0 [ORD=89] [ID=33]
0x3e5ae50: v4f32,ch = CopyFromReg 0x2fe2f48, 0x3d5a828 [ORD=89] [ID=22]
0x2fe2f48: ch = EntryToken [ORD=88] [ID=0]
0x3d5a828: v4f32 = Register %reg1057 [ORD=89] [ID=3]
0x3d5aad0: v4f32,ch = CopyFromReg 0x2fe2f48, 0x3e5a878 [ORD=89] [ID=23]
0x2fe2f48: ch = EntryToken [ORD=88] [ID=0]
0x3e5a878: v4f32 = Register %reg1056 [ORD=89] [ID=4]
0x2fe37a0: f32 = GHAL3DISD::EXTRACTZ 0x3d5abe0 [ID=36]
0x3d5abe0: v4f32,ch = CopyFromReg 0x2fe2f48, 0x2fe3580 [ORD=91] [ID=24]
0x2fe2f48: ch = EntryToken [ORD=88] [ID=0]
0x2fe3580: v4f32 = Register %reg1048 [ORD=91] [ID=5]
0x3e5c430: f32,ch = CopyFromReg 0x2fe2f48, 0x3e5a6e0 [ORD=103] [ID=26]
0x2fe2f48: ch = EntryToken [ORD=88] [ID=0]
0x3e5a6e0: f32 = Register %reg1051 [ORD=103] [ID=8]
0x3e5ab20: f32,ch = CopyFromReg 0x2fe2f48, 0x3e5a988 [ORD=124] [ID=29]
0x2fe2f48: ch = EntryToken [ORD=88] [ID=0]
0x3e5a988: f32 = Register %reg1025 [ORD=124] [ID=11]
0x3d23f18: ch = setgt [ID=20]
0x3d233f0: i32 = Constant<-1> [ORD=88] [ID=2]
0x3e5ac30: i32 = Constant<1> [ID=17]
error: midlevel compiler failed build.

Any idea what this means?

Peter

6 posts / 0 nouveau(x)
Dernière contribution
Reportez-vous à notre Notice d'optimisation pour plus d'informations sur les choix et l'optimisation des performances dans les produits logiciels Intel.
Portrait de Brijender Bharti (Intel)

Its hard to know from the error log. If you can share the code that will be great. Or I will suggest start deleting the code from kernel to pin point to a small set of lines which causes this issue.

Portrait de peastman

Here is the source code for the kernel. This is part of OpenMM (https://simtk.org/home/openmm):

#define EXP native_exp
#define LOG native_log
#define RECIP native_recip
#define RSQRT native_rsqrt
#define SQRT native_sqrt
#define WORK_GROUP_SIZE 64

#define CONSTRAIN_VELOCITIES 1

/**
 * Enforce constraints on SHAKE clusters
 */

__kernel void applyShakeToHydrogens(int numClusters, float tol, __global const float4* restrict oldPos, __global float4* restrict posDelta, __global const int4* restrict clusterAtoms, __global const float4* restrict clusterParams) {
    int index = get_global_id(0);
    while (index < numClusters) {
        // Load the data for this cluster.

        int4 atoms = clusterAtoms[index];
        float4 params = clusterParams[index];
        float4 pos = oldPos[atoms.x];
        float4 xpi = posDelta[atoms.x];
        float4 pos1 = oldPos[atoms.y];
        float4 xpj1 = posDelta[atoms.y];
        float4 pos2 = {0.0f, 0.0f, 0.0f, 0.0f};
        float4 xpj2 = {0.0f, 0.0f, 0.0f, 0.0f};
        float invMassCentral = params.x;
        float avgMass = params.y;
        float d2 = params.z;
        float invMassPeripheral = params.w;
        if (atoms.z != -1) {
            pos2 = oldPos[atoms.z];
            xpj2 = posDelta[atoms.z];
        }
        float4 pos3 = {0.0f, 0.0f, 0.0f, 0.0f};
        float4 xpj3 = {0.0f, 0.0f, 0.0f, 0.0f};
        if (atoms.w != -1) {
            pos3 = oldPos[atoms.w];
            xpj3 = posDelta[atoms.w];
        }

        // Precompute quantities.

        float4 rij1 = pos-pos1;
        float4 rij2 = pos-pos2;
        float4 rij3 = pos-pos3;
        float rij1sq = rij1.x*rij1.x + rij1.y*rij1.y + rij1.z*rij1.z;
        float rij2sq = rij2.x*rij2.x + rij2.y*rij2.y + rij2.z*rij2.z;
        float rij3sq = rij3.x*rij3.x + rij3.y*rij3.y + rij3.z*rij3.z;
        float ld1 = d2-rij1sq;
        float ld2 = d2-rij2sq;
        float ld3 = d2-rij3sq;

        // Iterate until convergence.

        bool converged = false;
        int iteration = 0;
        while (iteration < 15 && !converged) {
            converged = true;
#ifdef CONSTRAIN_VELOCITIES
            float4 rpij = xpi-xpj1;
            float rrpr = rpij.x*rij1.x + rpij.y*rij1.y + rpij.z*rij1.z;
            float delta = -2.0f*avgMass*rrpr/rij1sq;
            float4 dr = rij1*delta;
            xpi.xyz += dr.xyz*invMassCentral;
            xpj1.xyz -= dr.xyz*invMassPeripheral;
            if (fabs(delta) > tol)
                converged = false;
            if (atoms.z != -1) {
                rpij = xpi-xpj2;
                rrpr = rpij.x*rij2.x + rpij.y*rij2.y + rpij.z*rij2.z;
                delta = -2.0f*avgMass*rrpr/rij2sq;
                dr = rij2*delta;
                xpi.xyz += dr.xyz*invMassCentral;
                xpj2.xyz -= dr.xyz*invMassPeripheral;
                if (fabs(delta) > tol)
                    converged = false;
            }
            if (atoms.w != -1) {
                rpij = xpi-xpj3;
                rrpr = rpij.x*rij3.x + rpij.y*rij3.y + rpij.z*rij3.z;
                delta = -2.0f*avgMass*rrpr/rij3sq;
                dr = rij3*delta;
                xpi.xyz += dr.xyz*invMassCentral;
                xpj3.xyz -= dr.xyz*invMassPeripheral;
                if (fabs(delta) > tol)
                    converged = false;
            }
#else
            float4 rpij = xpi-xpj1;
            float rpsqij = rpij.x*rpij.x + rpij.y*rpij.y + rpij.z*rpij.z;
            float rrpr = rij1.x*rpij.x + rij1.y*rpij.y + rij1.z*rpij.z;
            float diff = fabs(ld1-2.0f*rrpr-rpsqij) / (d2*tol);
            if (diff >= 1.0f) {
                float acor  = (ld1-2.0f*rrpr-rpsqij)*avgMass / (rrpr+rij1sq);
                float4 dr = rij1*acor;
                xpi.xyz += dr.xyz*invMassCentral;
                xpj1.xyz -= dr.xyz*invMassPeripheral;
                converged = false;
            }
            if (atoms.z != -1) {
                rpij.xyz = xpi.xyz-xpj2.xyz;
                rpsqij = rpij.x*rpij.x + rpij.y*rpij.y + rpij.z*rpij.z;
                rrpr = rij2.x*rpij.x + rij2.y*rpij.y + rij2.z*rpij.z;
                diff = fabs(ld2-2.0f*rrpr-rpsqij) / (d2*tol);
                if (diff >= 1.0f) {
                    float acor  = (ld2 - 2.0f*rrpr - rpsqij)*avgMass / (rrpr + rij2sq);
                    float4 dr = rij2*acor;
                    xpi.xyz += dr.xyz*invMassCentral;
                    xpj2.xyz -= dr.xyz*invMassPeripheral;
                    converged = false;
                }
            }
            if (atoms.w != -1) {
                rpij.xyz = xpi.xyz-xpj3.xyz;
                rpsqij = rpij.x*rpij.x + rpij.y*rpij.y + rpij.z*rpij.z;
                rrpr = rij3.x*rpij.x + rij3.y*rpij.y + rij3.z*rpij.z;
                diff = fabs(ld3 - 2.0f*rrpr - rpsqij) / (d2*tol);
                if (diff >= 1.0f) {
                    float acor  = (ld3-2.0f*rrpr-rpsqij)*avgMass / (rrpr+rij3sq);
                    float4 dr = rij3*acor;
                    xpi.xyz += dr.xyz*invMassCentral;
                    xpj3.xyz -= dr.xyz*invMassPeripheral;
                    converged = false;
                }
            }
#endif
            iteration++;
        }

        // Record the new positions.

        posDelta[atoms.x] = xpi;
        posDelta[atoms.y] = xpj1;
        if (atoms.z != -1)
            posDelta[atoms.z] = xpj2;
        if (atoms.w != -1)
            posDelta[atoms.w] = xpj3;
        index += get_global_size(0);
    }
}

Peter

Portrait de Raghu Muthyalampalli (Intel)

Hi Peter,

Thanks for providing the kernel. I am able to reproduce the build error on the GPU using the 2012SDK. I'll get back to you on my findings after debug it more.

Thanks,
Raghu

Portrait de Raghu Muthyalampalli (Intel)

I was able to narrow down the test case to a few lines causing the build to fail. I have filed a bug and will let you know if development suggests a fix or a workaround.

Thanks,
Raghu

Portrait de peastman

Great, thank you for the update.

Peter

Connectez-vous pour laisser un commentaire.