Skip to content

Instantly share code, notes, and snippets.

What would you like to do?
#include <CL/cl.hpp>
#include <iostream>
using namespace std;
const char *SOURCE = R"RAW(
#define local_barrier() barrier(CLK_LOCAL_MEM_FENCE);
#define WITHIN_KERNEL /* empty */
#define KERNEL __kernel
#define GLOBAL_MEM __global
#define LOCAL_MEM __local
#define LOCAL_MEM_ARG __local
#define REQD_WG_SIZE(X,Y,psc_Z) __attribute__((reqd_work_group_size(X, Y, psc_Z)))
#define psc_LID_0 get_local_id(0)
#define psc_LID_1 get_local_id(1)
#define psc_LID_2 get_local_id(2)
#define psc_GID_0 get_group_id(0)
#define psc_GID_1 get_group_id(1)
#define psc_GID_2 get_group_id(2)
#define psc_LDIM_0 get_local_size(0)
#define psc_LDIM_1 get_local_size(1)
#define psc_LDIM_2 get_local_size(2)
#define psc_GDIM_0 get_num_groups(0)
#define psc_GDIM_1 get_num_groups(1)
#define psc_GDIM_2 get_num_groups(2)
#if __OPENCL_C_VERSION__ < 120
#pragma OPENCL EXTENSION cl_khr_fp64: enable
#define psc_WG_SIZE 16
#define psc_SCAN_EXPR(a, b, across_seg_boundary) a+b
#define psc_INPUT_EXPR(i) (input_ary[i])
typedef int psc_scan_type;
typedef int psc_index_type;
// NO_SEG_BOUNDARY is the largest representable integer in psc_index_type.
// This assumption is used in code below.
#define NO_SEG_BOUNDARY 2147483647
#define psc_K 256
void scan_scan_intervals_lev1(
__global int *input_ary, __global int *output_ary,
GLOBAL_MEM psc_scan_type *restrict psc_partial_scan_buffer,
const psc_index_type N,
const psc_index_type psc_interval_size
, GLOBAL_MEM psc_scan_type *restrict psc_interval_results
// index psc_K in first dimension used for psc_carry storage
struct psc_wrapped_scan_type
psc_scan_type psc_value;
// padded in psc_WG_SIZE to avoid bank conflicts
LOCAL_MEM struct psc_wrapped_scan_type psc_ldata[psc_WG_SIZE];
for(int i = 0; i < 10; ++i)
psc_scan_type psc_val = 0;
if (psc_LID_0 >= 2)
psc_scan_type psc_tmp = psc_ldata[psc_LID_0 - 2].psc_value;
psc_val = psc_tmp+ psc_val;
// {{{ writes to local allowed, reads from local not allowed
psc_ldata[psc_LID_0].psc_value = psc_val;
int main(int argc, char *argv[])
cl::Device device = cl::Device::getDefault();
cl::CommandQueue queue = cl::CommandQueue::getDefault();
cl::Program program(SOURCE, true);
cout << device.getInfo<CL_DEVICE_NAME>() << endl;
auto kernel = cl::make_kernel<cl::Buffer, cl::Buffer, cl::Buffer, cl_int, cl_int, cl::Buffer>(program, "scan_scan_intervals_lev1");
cl_int i;
cl::Buffer buffer;
kernel(cl::EnqueueArgs(queue, cl::NDRange(16), cl::NDRange(16)),
buffer, buffer, buffer, i, i, buffer);
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment