What's the minimum parallel granularity in OpenCL?

Hi everybody,

I want to know how many work-items are executed simultaneously in OpenCL, similar to a warp in CUDA. I wrote some code to test an Nvidia GPU and a Xeon Phi coprocessor, and the results were 32 and 8192 respectively.

Kernel code:

// Half of the work-items spin-wait on *flag while one work-item in the
// other half sets it. If the two halves execute simultaneously, the
// spinning half observes the flag; if they execute sequentially (i.e.
// the whole NDRange fits in one lockstep group), it never does.
// "volatile" keeps the compiler from caching *flag in a register.
__kernel void query_min_granularity(int n_loop, volatile __global int* flag, __global int* output) {
	int tid = get_global_id(0);
	int total_items = get_global_size(0);

	int half_index = total_items / 2;

	if (tid < half_index) {
		// Spin for at most n_loop iterations, watching for the flag.
		for (int i = 0; i < n_loop; i++) {
			if (*flag == 1)
				break;
		}
		// output = 1 means the flag was never observed while spinning.
		if (tid == 0)
			*output = (*flag != 1) ? 1 : 0;
	} else if (tid == half_index) {
		// One work-item in the second half raises the flag.
		*flag = 1;
	}
}

Host code:

#include <stdlib.h>
#include <stdio.h>
#include <iostream>
#include <CL/cl.h>

using namespace std;

#define OPENCL_CHECK_ERRORS(ERR)					\
		if(ERR != CL_SUCCESS) {						\
			cerr									\
			<< "OpenCL error with code " << ERR		\
			<< " happened in file " << __FILE__		\
			<< " at line " << __LINE__				\
			<< ". Exiting...\n";					\
			exit(1);								\
		}

int main() {
	cl_int err = CL_SUCCESS;

	cl_platform_id platform;
	err = clGetPlatformIDs(1, &platform, 0);
	OPENCL_CHECK_ERRORS(err);

	cl_device_id device;
	/* CL_DEVICE_TYPE_ACCELERATOR selects the Xeon Phi; use
	   CL_DEVICE_TYPE_GPU when testing the Nvidia GPU instead. */
	err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_ACCELERATOR, 1, &device, 0);
	OPENCL_CHECK_ERRORS(err);

	cl_context context = clCreateContext(NULL, 1, &device, NULL, NULL, &err);
	OPENCL_CHECK_ERRORS(err);

	cl_command_queue queue = clCreateCommandQueue(context, device, 0, &err);
	OPENCL_CHECK_ERRORS(err);

	FILE *fp = fopen("min_pgran.cl", "rb");
	if (fp == NULL) {
		printf("The kernel file was not found!\n");
		exit(-1);
	}
	fseek(fp, 0, SEEK_END);
	size_t file_size = ftell(fp);
	fseek(fp, 0, SEEK_SET);
	char* source = (char*) malloc(file_size + 1);
	if (fread(source, 1, file_size, fp) != file_size) {
		printf("Failed to read the kernel file!\n");
		exit(-1);
	}
	source[file_size] = '\0';
	fclose(fp);

	cl_program program = clCreateProgramWithSource(context, 1,
			(const char**) &source, &file_size, &err);
	OPENCL_CHECK_ERRORS(err);
	err = clBuildProgram(program, 1, &device, NULL, NULL, NULL);
	OPENCL_CHECK_ERRORS(err);

	cl_kernel kernel = clCreateKernel(program, "query_min_granularity", &err);
	OPENCL_CHECK_ERRORS(err);

	size_t max_group_size;
	clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_GROUP_SIZE,
			sizeof(max_group_size), &max_group_size, NULL);
	size_t min_gran = max_group_size;

	while (min_gran > 0) {
		cl_int cl_input = 1000;
		err = clSetKernelArg(kernel, 0, sizeof(cl_int), &cl_input);
		OPENCL_CHECK_ERRORS(err);
		int flag = -1;
		/* The kernel writes to *flag, so the buffer must be read-write. */
		cl_mem cl_flag = clCreateBuffer(context,
				CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, sizeof(int), &flag,
				&err);
		OPENCL_CHECK_ERRORS(err);
		err = clSetKernelArg(kernel, 1, sizeof(cl_mem), &cl_flag);
		OPENCL_CHECK_ERRORS(err);

		int output = -1;
		cl_mem cl_output = clCreateBuffer(context,
				CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, sizeof(int), &output,
				&err);
		OPENCL_CHECK_ERRORS(err);
		err = clSetKernelArg(kernel, 2, sizeof(cl_mem), &cl_output);
		OPENCL_CHECK_ERRORS(err);

		size_t global_work_size[1] = { min_gran };
		size_t local_work_size[1] = { min_gran };
		err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, global_work_size,
				local_work_size, 0, NULL, NULL);
		OPENCL_CHECK_ERRORS(err);
		clFinish(queue);

		err = clEnqueueReadBuffer(queue, cl_output, CL_TRUE, 0, sizeof(cl_int),
				&output, 0, NULL, NULL);
		OPENCL_CHECK_ERRORS(err);
		clReleaseMemObject(cl_flag);
		clReleaseMemObject(cl_output);

		/* output == 1 means the flag was never seen while spinning, i.e.
		   the whole NDRange ran as a single lockstep group. */
		if (output == 1) {
			break;
		}

		min_gran /= 2;
	}

	clReleaseKernel(kernel);
	clReleaseProgram(program);
	clReleaseCommandQueue(queue);
	clReleaseContext(context);
	free(source);

	printf("The maximum group size is: %zu\n", max_group_size);
	printf("The minimum granularity is: %zu\n", min_gran);
	return 0;
}

Are these results correct? If not, please tell me the right method to test this.

Best regards,

Capar


Hi Capar,

8192 is indeed the maximum work-group size on the Xeon Phi, but that number by itself doesn't indicate the level of parallelism.

Please note that the Xeon Phi coprocessor is a set of x86 processors. On Xeon Phi, a work-group (WG) is implemented as a routine that loops over the work-items; the body of the loop is the kernel. On top of that, this routine is automatically vectorized with a width of 16, so each group of 16 work-items in the same WG executes in parallel on the same logical core. The minimum WG size required to allow vectorization is 16 (in dimension zero).
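As a rough illustration (a sketch with hypothetical names, not Intel's actual implementation), the runtime conceptually turns each WG into a loop like this:

/* Conceptual sketch: how a work-group is lowered to a work-item loop
   on Xeon Phi. run_work_group, kernel_body, and kernel_args_t are
   hypothetical names used only for illustration. */
void run_work_group(kernel_args_t *args, size_t wg_size)
{
	/* The implicit work-item loop; the compiler auto-vectorizes it,
	   so 16 consecutive iterations execute as one SIMD operation. */
	for (size_t local_id = 0; local_id < wg_size; local_id++) {
		kernel_body(args, local_id); /* the original kernel code */
	}
}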

Whole WGs are scheduled to execute on the logical cores, so WGs do execute in parallel. Assuming a Xeon Phi with 60 physical cores, each hosting 4 logical cores (hardware threads), up to 240 WGs can execute in parallel. However, we recommend submitting NDRange tasks with at least 1000 WGs to allow efficient load-balancing.
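For example, a minimal host-side sketch (the sizes here are illustrative assumptions) that queries the number of compute units and picks a WG count well above it:

/* Sketch: query the compute units (logical cores on Xeon Phi) and
   derive an NDRange with plenty of WGs for load-balancing. */
cl_uint compute_units = 0;
clGetDeviceInfo(device, CL_DEVICE_MAX_COMPUTE_UNITS,
		sizeof(compute_units), &compute_units, NULL);
/* A 60-core Xeon Phi reports roughly 240 (4 HW threads per core). */
size_t local_size = 16;                       /* one vector width */
size_t num_groups = 1000;                     /* >> compute_units */
size_t global_size = num_groups * local_size;
printf("Compute units: %u, launching %zu WGs\n", compute_units, num_groups);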

For more reading, please take a look at the "training" tab: https://software.intel.com/en-us/vcsource/tools/opencl-sdk-xe
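Also, standard OpenCL (1.1 and later) offers a direct query for the warp-like execution width of a compiled kernel, which is a simpler way to get the number you were measuring:

/* On Nvidia GPUs this typically reports the warp size (32); other
   devices report their own preferred SIMD-style multiple. */
size_t preferred_multiple = 0;
err = clGetKernelWorkGroupInfo(kernel, device,
		CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE,
		sizeof(preferred_multiple), &preferred_multiple, NULL);
OPENCL_CHECK_ERRORS(err);
printf("Preferred work-group size multiple: %zu\n", preferred_multiple);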

I hope that this helps.

Arik
