Skip to content

Instantly share code, notes, and snippets.

@chick
Created December 30, 2016 22:30
Show Gist options
  • Save chick/9407ef6e004dcb2c5de09e5e26b95b6a to your computer and use it in GitHub Desktop.
Save chick/9407ef6e004dcb2c5de09e5e26b95b6a to your computer and use it in GitHub Desktop.
Simple OpenCL kernel for 7pt operator
// <file: control.c>
#include <stdio.h>
#include <time.h>
#ifdef __APPLE__
#include <OpenCL/opencl.h>
#else
#include <CL/cl.h>
#endif
int control(cl_command_queue queue, cl_kernel kernel_0, cl_mem out, cl_mem mesh) {
int error_code = 0;
size_t global_512_512 [2] = {512, 512};
size_t local_16_32 [2] = {16, 32};
error_code |= clSetKernelArg(kernel_0, 0, 8, & out);
error_code |= clSetKernelArg(kernel_0, 1, 8, & mesh);
error_code |= clEnqueueNDRangeKernel(queue, kernel_0, 2, NULL, global_512_512 , local_16_32 , 0, NULL, NULL);
clFinish(queue);
if (error_code != 0) printf("error code %d\n", error_code);
return error_code;
};
)))
INFO:ctree.c.nodes:compilation command: gcc -shared -fPIC -std=c99 -O2 -framework OpenCL -o /var/folders/ls/0xl3wjy949b2b9j36yn36v3c0000gn/T/tmpEfglgb/snowflake/5598646763549957670/_5486821843753659896/_7985492147856592190/PencilCompiler/control.so /var/folders/ls/0xl3wjy949b2b9j36yn36v3c0000gn/T/tmpEfglgb/snowflake/5598646763549957670/_5486821843753659896/_7985492147856592190/PencilCompiler/control.c -framework OpenCL
INFO:ctree.nodes:Recreating source
INFO:ctree.nodes:file for generated OpenCL: /var/folders/ls/0xl3wjy949b2b9j36yn36v3c0000gn/T/tmpEfglgb/snowflake/5598646763549957670/_5486821843753659896/_7985492147856592190/PencilCompiler/kernel_0.cl
INFO:ctree.nodes:generated OpenCL code: (((
// <file: kernel_0.cl>
#define encode514_514_514(x0, x1, x2) (264196 * (long) (x0) + 514 * (long) (x1) + 1 * (long) (x2))
#define encode514_514(x0, x1) (514 * (long) (x0) + 1 * (long) (x1))
#define encode18_34(x0, x1) (34 * (long) (x0) + 1 * (long) (x1))
__kernel void kernel_0(__global float* out, __global float* mesh) {
__local float local_buf_0[612];
__local float local_buf_1[612];
__local float local_buf_2[612];
__local float* plane_0 = local_buf_0;
__local float* plane_1 = local_buf_1;
__local float* plane_2 = local_buf_2;
__local float* temp_plane;
size_t tile_id_1 = get_group_id(0);
size_t tile_id_2 = get_group_id(1);
size_t packed_global_id_1 = get_global_id(0);
size_t packed_global_id_2 = get_global_id(1);
size_t packed_local_id_1 = get_local_id(0);
size_t packed_local_id_2 = get_local_id(1);
size_t thread_id = packed_local_id_1 * 32 + packed_local_id_2;
size_t group_id_0 = get_group_id(0);
size_t group_id_1 = get_group_id(1);
size_t index_0;
size_t index_1;
size_t index_2;
size_t local_index_0;
size_t local_index_1;
size_t local_index_2;
size_t dim_0_offsets[] = {1};
size_t dim_0_strides[] = {1};
size_t dim_1_offsets[] = {1};
size_t dim_1_strides[] = {1};
index_1 = (packed_global_id_1 % 512) * dim_1_strides[(packed_global_id_1 / 512)] + dim_1_offsets[(packed_global_id_1 / 512)];
local_index_1 = (packed_local_id_1 % 512) * dim_1_strides[(packed_global_id_1 / 512)] + dim_1_offsets[(packed_global_id_1 / 512)];
size_t dim_2_offsets[] = {1};
size_t dim_2_strides[] = {1};
index_2 = (packed_global_id_2 % 512) * dim_2_strides[(packed_global_id_2 / 512)] + dim_2_offsets[(packed_global_id_2 / 512)];
local_index_2 = (packed_local_id_2 % 512) * dim_2_strides[(packed_global_id_2 / 512)] + dim_2_offsets[(packed_global_id_2 / 512)];
//
// Fill the first local memory planes
//
plane_1[(thread_id + 0)] = mesh[encode514_514_514(0, tile_id_1 * 16 + ((thread_id + 0) / 34), tile_id_2 * 32 + ((thread_id + 0) % 34))];
if (thread_id + 512 < 612) {
plane_1[(thread_id + 512)] = mesh[encode514_514_514(0, tile_id_1 * 16 + ((thread_id + 512) / 34), tile_id_2 * 32 + ((thread_id + 512) % 34))];
};
plane_2[(thread_id + 0)] = mesh[encode514_514_514(1, tile_id_1 * 16 + ((thread_id + 0) / 34), tile_id_2 * 32 + ((thread_id + 0) % 34))];
if (thread_id + 512 < 612) {
plane_2[(thread_id + 512)] = mesh[encode514_514_514(1, tile_id_1 * 16 + ((thread_id + 512) / 34), tile_id_2 * 32 + ((thread_id + 512) % 34))];
};
barrier(CLK_LOCAL_MEM_FENCE);
for (index_0 = 1; index_0 <= 512; index_0 ++) {
temp_plane = plane_0;
plane_0 = plane_1;
plane_1 = plane_2;
plane_2 = temp_plane;
plane_2[(thread_id + 0)] = mesh[encode514_514_514(index_0 + 1, tile_id_1 * 16 + ((thread_id + 0) / 34), tile_id_2 * 32 + ((thread_id + 0) % 34))];
if (thread_id + 512 < 612) {
plane_2[(thread_id + 512)] = mesh[encode514_514_514(index_0 + 1, tile_id_1 * 16 + ((thread_id + 512) / 34), tile_id_2 * 32 + ((thread_id + 512) % 34))];
};
barrier(CLK_LOCAL_MEM_FENCE);
out[encode514_514_514(index_0, index_1, index_2)] = plane_0[encode18_34(local_index_1 + 0, local_index_2 + 0)] + plane_1[encode18_34(local_index_1 + -1, local_index_2 + 0)] + plane_1[encode18_34(local_index_1 + 0, local_index_2 + -1)] + plane_1[encode18_34(local_index_1 + 0, local_index_2 + 0)] * -6 + plane_1[encode18_34(local_index_1 + 0, local_index_2 + 1)] + plane_1[encode18_34(local_index_1 + 1, local_index_2 + 0)] + plane_2[encode18_34(local_index_1 + 0, local_index_2 + 0)];
barrier(CLK_LOCAL_MEM_FENCE);
};
};
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment