pavanky/fft2.cpp

## fft2.cpp
#include <stdio.h>
#include <clFFT.h>

#define ERR(str, status) do {                   \
        printf("%s(%d):"str,                    \
               __FILE__, __LINE__, status);     \
        return status;                          \
    } while(0)

#define CLFFT(fn) do {                          \
        clfftStatus _err = fn;                  \
        if (_err != CLFFT_SUCCESS) {            \
            ERR("clFFT Error : %d\n", _err);    \
        }                                       \
    } while(0)

// for ocl calls that return errors
#define OCL(call) do {                          \
        cl_int _err = (call);                   \
        if ( _err != CL_SUCCESS) {              \
            ERR("OpenCL Error: %d\n", _err);    \
        }                                       \
    } while(0)


#define CHECK(call) do {                        \
        cl_int _err_ = CL_SUCCESS;              \
        call;                                   \
        OCL(_err_);                             \
    } while(0)


cl_context context = 0;
cl_device_id device = 0;
cl_platform_id platform = 0;
cl_command_queue queue = 0;

class _clfft_init
{
public:
    _clfft_init()
        {
            // Initialize flags for FFT library
            clfftSetupData* setupData = new clfftSetupData();
            clfftInitSetupData(setupData);
            clfftSetup(setupData);
        }
    ~_clfft_init()
        {
            clfftTeardown();
        }
};

static void clfft_init()
{
    static _clfft_init _fft = _clfft_init();
    return;
}

int init()
{
    OCL(clGetPlatformIDs(1, &platform, NULL));
    OCL(clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU,
                       1, &device, NULL));

    // Create context
    cl_context_properties props[] = {
        CL_CONTEXT_PLATFORM, (cl_context_properties)platform,
        0
    };
    CHECK(context = clCreateContext(props, 1, &device, NULL, NULL, &_err_));
    CHECK(queue = clCreateCommandQueue(context, device, 0, &_err_));

    size_t max_const_mem = 0;
    OCL(clGetDeviceInfo(device, CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE,
                        sizeof(size_t), &max_const_mem, NULL));

    printf("Const memory size: %zu\n", max_const_mem);

    clfft_init();

    return CL_SUCCESS;
}

int main()
{
    OCL(init());

    clfftLayout layout = CLFFT_COMPLEX_INTERLEAVED;
    clfftPrecision pres = CLFFT_SINGLE;
    clfftResultLocation loc = CLFFT_INPLACE;
    clfftDim dim = CLFFT_2D;
    clfftDirection dir = CLFFT_FORWARD;
    cl_float scale = 1.0;
    unsigned batch = 1;
    cl_mem d_idata;
    cl_mem d_odata;

    for(int n = 256; n <= 2048; n*=2) {
        printf("%4d, %4d\n", n, n);

        size_t lengths[3] = {n, n, 1};
        size_t strides[4] = {1, 1, 1, 1};
        for (int  i = 1; i <= 3; i++) strides[i] = strides[i - 1] * lengths[i - 1];
        size_t numel = strides[3];

        CHECK(d_odata = clCreateBuffer(context, CL_MEM_READ_WRITE,
                                       n * n * sizeof(cl_float2),
                                       NULL, &_err_));

        CHECK(d_idata = clCreateBuffer(context, CL_MEM_READ_WRITE,
                                       n * n * sizeof(cl_float2),
                                       NULL, &_err_));

        clfftPlanHandle plan;
        CLFFT(clfftCreateDefaultPlan(&plan, context, dim, lengths));
        CLFFT(clfftSetPlanPrecision(plan, pres));
        CLFFT(clfftSetPlanScale(plan, dir, scale));
        CLFFT(clfftSetPlanBatchSize(plan, batch));
        CLFFT(clfftSetPlanDim(plan, dim));
        CLFFT(clfftSetPlanLength(plan, dim, lengths));
        CLFFT(clfftSetPlanInStride(plan, dim, strides));
        CLFFT(clfftSetPlanOutStride(plan, dim, strides));
        CLFFT(clfftSetPlanDistance(plan, numel, numel));
        CLFFT(clfftSetLayout(plan, layout, layout));
        CLFFT(clfftSetResultLocation(plan, loc));
        CLFFT(clfftBakePlan(plan, 1, &queue, NULL, NULL));
        OCL(clFinish(queue));

        // Perform the transform
        CLFFT(clfftEnqueueTransform(plan, dir, 1, &queue, 0, NULL, NULL,
                                    &d_idata, NULL, NULL));

        OCL(clFinish(queue));

        CLFFT(clfftDestroyPlan(&plan));
        OCL(clReleaseMemObject(d_idata));
        OCL(clReleaseMemObject(d_odata));
    }

    return 0;
}

## trans.cl
#define HSTRIDE 1024
#define VSTRIDE 1024
#define DIM 32

__attribute__((reqd_work_group_size(64,1,1)))
__kernel void
fft_trans(__global float2 * gcomplx)
{
    __local float2 ldsa[1024];
    __local float2 ldsb[1024];
    uint gid = get_global_id(0);
    uint me = gid & 0x3fU;
    uint k = (gid >> 6) % 528;

    // Compute location of blocks
    int l = DIM+0.5f - native_sqrt((DIM+0.5f)*(DIM+0.5f) - 2.0f * (float)as_int(k));
    int kl = ((DIM*2+1 - l) * l) >> 1;
    uint j = k - kl;
    uint i = l + j;

    uint goa, gob;
    uint go = ((me & 0x7U) << 2) + ((gid>>6)/528) * VSTRIDE * HSTRIDE;
    __global float4 *gp;
    __local float4 *lp4;
    uint lo = ((me & 0x7U) << 7) + (me >> 3);
    uint lot = (me<<2);
    __local float2 *lp;
    float4 z00, z01, z10, z11, z20, z21, z30, z31;

    // Array offsets
    goa = go + (i << 5) + j * (HSTRIDE*32) + (me >> 3)*HSTRIDE;

    // Load A block
    gp = (__global float4 *)(gcomplx + goa);
    z00 = gp[0*HSTRIDE/4*16];
    z01 = gp[0*HSTRIDE/4*16 + 1];
    z10 = gp[1*HSTRIDE/4*16];
    z11 = gp[1*HSTRIDE/4*16 + 1];
    z20 = gp[2*HSTRIDE/4*16];
    z21 = gp[2*HSTRIDE/4*16 + 1];
    z30 = gp[3*HSTRIDE/4*16];
    z31 = gp[3*HSTRIDE/4*16 + 1];

    // Save into LDS
    lp = ldsa + lo;
    lp[0*8] = z00.xy;
    lp[1*8] = z10.xy;
    lp[2*8] = z20.xy;
    lp[3*8] = z30.xy;
    lp += 8*4;

    lp[0*8] = z00.zw;
    lp[1*8] = z10.zw;
    lp[2*8] = z20.zw;
    lp[3*8] = z30.zw;
    lp += 8*4;

    lp[0*8] = z01.xy;
    lp[1*8] = z11.xy;
    lp[2*8] = z21.xy;
    lp[3*8] = z31.xy;
    lp += 8*4;

    lp[0*8] = z01.zw;
    lp[1*8] = z11.zw;
    lp[2*8] = z21.zw;
    lp[3*8] = z31.zw;
    lp += 8*4;

    //End load A block

    //  Load B block
    gob = go + (j << 5) + i * (HSTRIDE*32) + (me >> 3)*HSTRIDE;
    gp = (__global float4 *)(gcomplx + gob);
    z00 = gp[0*HSTRIDE/4*16];
    z01 = gp[0*HSTRIDE/4*16 + 1];
    z10 = gp[1*HSTRIDE/4*16];
    z11 = gp[1*HSTRIDE/4*16 + 1];
    z20 = gp[2*HSTRIDE/4*16];
    z21 = gp[2*HSTRIDE/4*16 + 1];
    z30 = gp[3*HSTRIDE/4*16];
    z31 = gp[3*HSTRIDE/4*16 + 1];

    // Save into LDS
    lp = ldsb + lo;
    lp[0*8] = z00.xy;
    lp[1*8] = z10.xy;
    lp[2*8] = z20.xy;
    lp[3*8] = z30.xy;
    lp += 8*4;

    lp[0*8] = z00.zw;
    lp[1*8] = z10.zw;
    lp[2*8] = z20.zw;
    lp[3*8] = z30.zw;
    lp += 8*4;

    lp[0*8] = z01.xy;
    lp[1*8] = z11.xy;
    lp[2*8] = z21.xy;
    lp[3*8] = z31.xy;
    lp += 8*4;

    lp[0*8] = z01.zw;
    lp[1*8] = z11.zw;
    lp[2*8] = z21.zw;
    lp[3*8] = z31.zw;
    lp += 8*4;

    // End load B block

    barrier(CLK_LOCAL_MEM_FENCE);

    // write A block
    goa = go + (i << 5) + j * (VSTRIDE*32) + (me >> 3)*VSTRIDE;
    gp = (__global float4 *)(gcomplx + goa);
    lp4 = (__local float4 *)(ldsb + lot);
    z00 = lp4[0];
    z01 = lp4[1];
    lp4 += 32*4;

    z10 = lp4[0];
    z11 = lp4[1];
    lp4 += 32*4;

    z20 = lp4[0];
    z21 = lp4[1];
    lp4 += 32*4;

    z30 = lp4[0];
    z31 = lp4[1];

    gp[0*VSTRIDE/4*16] = z00;
    gp[0*VSTRIDE/4*16+1] = z01;
    gp[1*VSTRIDE/4*16] = z10;
    gp[1*VSTRIDE/4*16+1] = z11;
    gp[2*VSTRIDE/4*16] = z20;
    gp[2*VSTRIDE/4*16+1] = z21;
    gp[3*VSTRIDE/4*16] = z30;
    gp[3*VSTRIDE/4*16+1] = z31;

    // End write A block;

    // write B block

    gob = go + (j << 5) + i * (VSTRIDE*32) + (me >> 3)*VSTRIDE;
    gp = (__global float4 *)(gcomplx + gob);
    lp4 = (__local float4 *)(ldsa + lot);

    z00 = lp4[0];
    z01 = lp4[1];
    lp4 += 32*4;

    z10 = lp4[0];
    z11 = lp4[1];
    lp4 += 32*4;

    z20 = lp4[0];
    z21 = lp4[1];
    lp4 += 32*4;

    z30 = lp4[0];
    z31 = lp4[1];

    gp[0*VSTRIDE/4*16] = z00;
    gp[0*VSTRIDE/4*16+1] = z01;
    gp[1*VSTRIDE/4*16] = z10;
    gp[1*VSTRIDE/4*16+1] = z11;
    gp[2*VSTRIDE/4*16] = z20;
    gp[2*VSTRIDE/4*16+1] = z21;
    gp[3*VSTRIDE/4*16] = z30;
    gp[3*VSTRIDE/4*16+1] = z31;

    // End write B block;

}

## trans.cpp
#include <stdio.h>
#include <CL/cl.h>
#include <iostream>
#include <fstream>

#define ERR(str, status) do {                   \
        printf("%s(%d):"str,                    \
               __FILE__, __LINE__, status);     \
        return status;                          \
    } while(0)

// for ocl calls that return errors
#define OCL(call) do {                          \
        cl_int _err = (call);                   \
        if ( _err != CL_SUCCESS) {              \
            ERR("OpenCL Error: %d\n", _err);    \
        }                                       \
    } while(0)


#define CHECK(call) do {                        \
        cl_int _err_ = CL_SUCCESS;              \
        call;                                   \
        OCL(_err_);                             \
    } while(0)


cl_context context = 0;
cl_device_id device = 0;
cl_platform_id platform = 0;
cl_command_queue queue = 0;

int init()
{
    OCL(clGetPlatformIDs(1, &platform, NULL));
    OCL(clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU,
                       1, &device, NULL));

    // Create context
    cl_context_properties props[] = {
        CL_CONTEXT_PLATFORM, (cl_context_properties)platform,
        0
    };
    CHECK(context = clCreateContext(props, 1, &device, NULL, NULL, &_err_));
    CHECK(queue = clCreateCommandQueue(context, device, 0, &_err_));

    size_t max_const_mem = 0;
    OCL(clGetDeviceInfo(device, CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE,
                        sizeof(size_t), &max_const_mem, NULL));

    printf("Const memory size: %zu\n", max_const_mem);

    return CL_SUCCESS;
}

int read_kernel(char **buffer, const char *file_name)
{
    std::ifstream ifs;
    ifs.open(file_name, std::ifstream::in);

    ifs.seekg(0, ifs.end);
    int length = ifs.tellg();
    ifs.seekg(0, ifs.beg);

    *buffer = new char[length];
    ifs.read(*buffer, length);
    ifs.close();

    return length;
}

int main()
{
    OCL(init());

    char *trans_str = NULL;
    size_t length = (size_t)read_kernel(&trans_str, "trans.cl");

    int n = 1024;
    size_t sz = n * n * sizeof(cl_float2);

    cl_program program;
    cl_kernel kernel;
    cl_mem d_data = 0;

    CHECK(d_data = clCreateBuffer(context, CL_MEM_READ_WRITE, sz, NULL, &_err_););
    CHECK(program = clCreateProgramWithSource(context, 1,
                                              (const char **)&trans_str,
                                              &length, &_err_); );

    OCL(clBuildProgram(program, 0, NULL, "", NULL, NULL));
    CHECK(kernel = clCreateKernel(program, "fft_trans", &_err_););

    size_t local[] = {64, 1};
    size_t global[] = {528 * local[0], 1 * local[1]};

    OCL(clSetKernelArg(kernel, 0, sizeof(cl_mem), &d_data));
    OCL(clEnqueueNDRangeKernel(queue, kernel, 1,
                               NULL, global, local, 0, NULL, NULL));

    OCL(clFinish(queue));

    if (d_data) OCL(clReleaseMemObject(d_data));
    delete[] trans_str;
    return 0;
}
	#include <stdio.h>
	#include <clFFT.h>

	#define ERR(str, status) do { \
	printf("%s(%d):"str, \
	__FILE__, __LINE__, status); \
	return status; \
	} while(0)

	#define CLFFT(fn) do { \
	clfftStatus _err = fn; \
	if (_err != CLFFT_SUCCESS) { \
	ERR("clFFT Error : %d\n", _err); \
	} \
	} while(0)

	// for ocl calls that return errors
	#define OCL(call) do { \
	cl_int _err = (call); \
	if ( _err != CL_SUCCESS) { \
	ERR("OpenCL Error: %d\n", _err); \
	} \
	} while(0)


	#define CHECK(call) do { \
	cl_int _err_ = CL_SUCCESS; \
	call; \
	OCL(_err_); \
	} while(0)


	cl_context context = 0;
	cl_device_id device = 0;
	cl_platform_id platform = 0;
	cl_command_queue queue = 0;

	class _clfft_init
	{
	public:
	_clfft_init()
	{
	// Initialize flags for FFT library
	clfftSetupData* setupData = new clfftSetupData();
	clfftInitSetupData(setupData);
	clfftSetup(setupData);
	}
	~_clfft_init()
	{
	clfftTeardown();
	}
	};

	static void clfft_init()
	{
	static _clfft_init _fft = _clfft_init();
	return;
	}

	int init()
	{
	OCL(clGetPlatformIDs(1, &platform, NULL));
	OCL(clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU,
	1, &device, NULL));

	// Create context
	cl_context_properties props[] = {
	CL_CONTEXT_PLATFORM, (cl_context_properties)platform,
	0
	};
	CHECK(context = clCreateContext(props, 1, &device, NULL, NULL, &_err_));
	CHECK(queue = clCreateCommandQueue(context, device, 0, &_err_));

	size_t max_const_mem = 0;
	OCL(clGetDeviceInfo(device, CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE,
	sizeof(size_t), &max_const_mem, NULL));

	printf("Const memory size: %zu\n", max_const_mem);

	clfft_init();

	return CL_SUCCESS;
	}

	int main()
	{
	OCL(init());

	clfftLayout layout = CLFFT_COMPLEX_INTERLEAVED;
	clfftPrecision pres = CLFFT_SINGLE;
	clfftResultLocation loc = CLFFT_INPLACE;
	clfftDim dim = CLFFT_2D;
	clfftDirection dir = CLFFT_FORWARD;
	cl_float scale = 1.0;
	unsigned batch = 1;
	cl_mem d_idata;
	cl_mem d_odata;

	for(int n = 256; n <= 2048; n*=2) {
	printf("%4d, %4d\n", n, n);

	size_t lengths[3] = {n, n, 1};
	size_t strides[4] = {1, 1, 1, 1};
	for (int i = 1; i <= 3; i++) strides[i] = strides[i - 1] * lengths[i - 1];
	size_t numel = strides[3];

	CHECK(d_odata = clCreateBuffer(context, CL_MEM_READ_WRITE,
	n * n * sizeof(cl_float2),
	NULL, &_err_));

	CHECK(d_idata = clCreateBuffer(context, CL_MEM_READ_WRITE,
	n * n * sizeof(cl_float2),
	NULL, &_err_));

	clfftPlanHandle plan;
	CLFFT(clfftCreateDefaultPlan(&plan, context, dim, lengths));
	CLFFT(clfftSetPlanPrecision(plan, pres));
	CLFFT(clfftSetPlanScale(plan, dir, scale));
	CLFFT(clfftSetPlanBatchSize(plan, batch));
	CLFFT(clfftSetPlanDim(plan, dim));
	CLFFT(clfftSetPlanLength(plan, dim, lengths));
	CLFFT(clfftSetPlanInStride(plan, dim, strides));
	CLFFT(clfftSetPlanOutStride(plan, dim, strides));
	CLFFT(clfftSetPlanDistance(plan, numel, numel));
	CLFFT(clfftSetLayout(plan, layout, layout));
	CLFFT(clfftSetResultLocation(plan, loc));
	CLFFT(clfftBakePlan(plan, 1, &queue, NULL, NULL));
	OCL(clFinish(queue));

	// Perform the transform
	CLFFT(clfftEnqueueTransform(plan, dir, 1, &queue, 0, NULL, NULL,
	&d_idata, NULL, NULL));

	OCL(clFinish(queue));

	CLFFT(clfftDestroyPlan(&plan));
	OCL(clReleaseMemObject(d_idata));
	OCL(clReleaseMemObject(d_odata));
	}

	return 0;
	}
	#define HSTRIDE 1024
	#define VSTRIDE 1024
	#define DIM 32

	__attribute__((reqd_work_group_size(64,1,1)))
	__kernel void
	fft_trans(__global float2 * gcomplx)
	{
	__local float2 ldsa[1024];
	__local float2 ldsb[1024];
	uint gid = get_global_id(0);
	uint me = gid & 0x3fU;
	uint k = (gid >> 6) % 528;

	// Compute location of blocks
	int l = DIM+0.5f - native_sqrt((DIM+0.5f)(DIM+0.5f) - 2.0f (float)as_int(k));
	int kl = ((DIM2+1 - l) l) >> 1;
	uint j = k - kl;
	uint i = l + j;

	uint goa, gob;
	uint go = ((me & 0x7U) << 2) + ((gid>>6)/528) * VSTRIDE * HSTRIDE;
	__global float4 *gp;
	__local float4 *lp4;
	uint lo = ((me & 0x7U) << 7) + (me >> 3);
	uint lot = (me<<2);
	__local float2 *lp;
	float4 z00, z01, z10, z11, z20, z21, z30, z31;

	// Array offsets
	goa = go + (i << 5) + j * (HSTRIDE32) + (me >> 3)HSTRIDE;

	// Load A block
	gp = (__global float4 *)(gcomplx + goa);
	z00 = gp[0HSTRIDE/416];
	z01 = gp[0HSTRIDE/416 + 1];
	z10 = gp[1HSTRIDE/416];
	z11 = gp[1HSTRIDE/416 + 1];
	z20 = gp[2HSTRIDE/416];
	z21 = gp[2HSTRIDE/416 + 1];
	z30 = gp[3HSTRIDE/416];
	z31 = gp[3HSTRIDE/416 + 1];

	// Save into LDS
	lp = ldsa + lo;
	lp[0*8] = z00.xy;
	lp[1*8] = z10.xy;
	lp[2*8] = z20.xy;
	lp[3*8] = z30.xy;
	lp += 8*4;

	lp[0*8] = z00.zw;
	lp[1*8] = z10.zw;
	lp[2*8] = z20.zw;
	lp[3*8] = z30.zw;
	lp += 8*4;

	lp[0*8] = z01.xy;
	lp[1*8] = z11.xy;
	lp[2*8] = z21.xy;
	lp[3*8] = z31.xy;
	lp += 8*4;

	lp[0*8] = z01.zw;
	lp[1*8] = z11.zw;
	lp[2*8] = z21.zw;
	lp[3*8] = z31.zw;
	lp += 8*4;

	//End load A block

	// Load B block
	gob = go + (j << 5) + i * (HSTRIDE32) + (me >> 3)HSTRIDE;
	gp = (__global float4 *)(gcomplx + gob);
	z00 = gp[0HSTRIDE/416];
	z01 = gp[0HSTRIDE/416 + 1];
	z10 = gp[1HSTRIDE/416];
	z11 = gp[1HSTRIDE/416 + 1];
	z20 = gp[2HSTRIDE/416];
	z21 = gp[2HSTRIDE/416 + 1];
	z30 = gp[3HSTRIDE/416];
	z31 = gp[3HSTRIDE/416 + 1];

	// Save into LDS
	lp = ldsb + lo;
	lp[0*8] = z00.xy;
	lp[1*8] = z10.xy;
	lp[2*8] = z20.xy;
	lp[3*8] = z30.xy;
	lp += 8*4;

	lp[0*8] = z00.zw;
	lp[1*8] = z10.zw;
	lp[2*8] = z20.zw;
	lp[3*8] = z30.zw;
	lp += 8*4;

	lp[0*8] = z01.xy;
	lp[1*8] = z11.xy;
	lp[2*8] = z21.xy;
	lp[3*8] = z31.xy;
	lp += 8*4;

	lp[0*8] = z01.zw;
	lp[1*8] = z11.zw;
	lp[2*8] = z21.zw;
	lp[3*8] = z31.zw;
	lp += 8*4;

	// End load B block

	barrier(CLK_LOCAL_MEM_FENCE);

	// write A block
	goa = go + (i << 5) + j * (VSTRIDE32) + (me >> 3)VSTRIDE;
	gp = (__global float4 *)(gcomplx + goa);
	lp4 = (__local float4 *)(ldsb + lot);
	z00 = lp4[0];
	z01 = lp4[1];
	lp4 += 32*4;

	z10 = lp4[0];
	z11 = lp4[1];
	lp4 += 32*4;

	z20 = lp4[0];
	z21 = lp4[1];
	lp4 += 32*4;

	z30 = lp4[0];
	z31 = lp4[1];

	gp[0VSTRIDE/416] = z00;
	gp[0VSTRIDE/416+1] = z01;
	gp[1VSTRIDE/416] = z10;
	gp[1VSTRIDE/416+1] = z11;
	gp[2VSTRIDE/416] = z20;
	gp[2VSTRIDE/416+1] = z21;
	gp[3VSTRIDE/416] = z30;
	gp[3VSTRIDE/416+1] = z31;

	// End write A block;

	// write B block

	gob = go + (j << 5) + i * (VSTRIDE32) + (me >> 3)VSTRIDE;
	gp = (__global float4 *)(gcomplx + gob);
	lp4 = (__local float4 *)(ldsa + lot);

	z00 = lp4[0];
	z01 = lp4[1];
	lp4 += 32*4;

	z10 = lp4[0];
	z11 = lp4[1];
	lp4 += 32*4;

	z20 = lp4[0];
	z21 = lp4[1];
	lp4 += 32*4;

	z30 = lp4[0];
	z31 = lp4[1];

	gp[0VSTRIDE/416] = z00;
	gp[0VSTRIDE/416+1] = z01;
	gp[1VSTRIDE/416] = z10;
	gp[1VSTRIDE/416+1] = z11;
	gp[2VSTRIDE/416] = z20;
	gp[2VSTRIDE/416+1] = z21;
	gp[3VSTRIDE/416] = z30;
	gp[3VSTRIDE/416+1] = z31;

	// End write B block;

	}
	#include <stdio.h>
	#include <CL/cl.h>
	#include <iostream>
	#include <fstream>

	#define ERR(str, status) do { \
	printf("%s(%d):"str, \
	__FILE__, __LINE__, status); \
	return status; \
	} while(0)

	// for ocl calls that return errors
	#define OCL(call) do { \
	cl_int _err = (call); \
	if ( _err != CL_SUCCESS) { \
	ERR("OpenCL Error: %d\n", _err); \
	} \
	} while(0)


	#define CHECK(call) do { \
	cl_int _err_ = CL_SUCCESS; \
	call; \
	OCL(_err_); \
	} while(0)


	cl_context context = 0;
	cl_device_id device = 0;
	cl_platform_id platform = 0;
	cl_command_queue queue = 0;

	int init()
	{
	OCL(clGetPlatformIDs(1, &platform, NULL));
	OCL(clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU,
	1, &device, NULL));

	// Create context
	cl_context_properties props[] = {
	CL_CONTEXT_PLATFORM, (cl_context_properties)platform,
	0
	};
	CHECK(context = clCreateContext(props, 1, &device, NULL, NULL, &_err_));
	CHECK(queue = clCreateCommandQueue(context, device, 0, &_err_));

	size_t max_const_mem = 0;
	OCL(clGetDeviceInfo(device, CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE,
	sizeof(size_t), &max_const_mem, NULL));

	printf("Const memory size: %zu\n", max_const_mem);

	return CL_SUCCESS;
	}

	int read_kernel(char *buffer, const char file_name)
	{
	std::ifstream ifs;
	ifs.open(file_name, std::ifstream::in);

	ifs.seekg(0, ifs.end);
	int length = ifs.tellg();
	ifs.seekg(0, ifs.beg);

	*buffer = new char[length];
	ifs.read(*buffer, length);
	ifs.close();

	return length;
	}

	int main()
	{
	OCL(init());

	char *trans_str = NULL;
	size_t length = (size_t)read_kernel(&trans_str, "trans.cl");

	int n = 1024;
	size_t sz = n * n * sizeof(cl_float2);

	cl_program program;
	cl_kernel kernel;
	cl_mem d_data = 0;

	CHECK(d_data = clCreateBuffer(context, CL_MEM_READ_WRITE, sz, NULL, &_err_););
	CHECK(program = clCreateProgramWithSource(context, 1,
	(const char **)&trans_str,
	&length, &_err_); );

	OCL(clBuildProgram(program, 0, NULL, "", NULL, NULL));
	CHECK(kernel = clCreateKernel(program, "fft_trans", &_err_););

	size_t local[] = {64, 1};
	size_t global[] = {528 * local[0], 1 * local[1]};

	OCL(clSetKernelArg(kernel, 0, sizeof(cl_mem), &d_data));
	OCL(clEnqueueNDRangeKernel(queue, kernel, 1,
	NULL, global, local, 0, NULL, NULL));

	OCL(clFinish(queue));

	if (d_data) OCL(clReleaseMemObject(d_data));
	delete[] trans_str;
	return 0;
	}