Skip to content

Instantly share code, notes, and snippets.

@pavanky
Last active December 25, 2015 19:59
Show Gist options
  • Save pavanky/7031425 to your computer and use it in GitHub Desktop.
Save pavanky/7031425 to your computer and use it in GitHub Desktop.
Test case for 2D fft
#include <stdio.h>
#include <clFFT.h>
#define ERR(str, status) do { \
printf("%s(%d):"str, \
__FILE__, __LINE__, status); \
return status; \
} while(0)
#define CLFFT(fn) do { \
clfftStatus _err = fn; \
if (_err != CLFFT_SUCCESS) { \
ERR("clFFT Error : %d\n", _err); \
} \
} while(0)
// for ocl calls that return errors
#define OCL(call) do { \
cl_int _err = (call); \
if ( _err != CL_SUCCESS) { \
ERR("OpenCL Error: %d\n", _err); \
} \
} while(0)
#define CHECK(call) do { \
cl_int _err_ = CL_SUCCESS; \
call; \
OCL(_err_); \
} while(0)
cl_context context = 0;
cl_device_id device = 0;
cl_platform_id platform = 0;
cl_command_queue queue = 0;
class _clfft_init
{
public:
_clfft_init()
{
// Initialize flags for FFT library
clfftSetupData* setupData = new clfftSetupData();
clfftInitSetupData(setupData);
clfftSetup(setupData);
}
~_clfft_init()
{
clfftTeardown();
}
};
static void clfft_init()
{
static _clfft_init _fft = _clfft_init();
return;
}
int init()
{
OCL(clGetPlatformIDs(1, &platform, NULL));
OCL(clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU,
1, &device, NULL));
// Create context
cl_context_properties props[] = {
CL_CONTEXT_PLATFORM, (cl_context_properties)platform,
0
};
CHECK(context = clCreateContext(props, 1, &device, NULL, NULL, &_err_));
CHECK(queue = clCreateCommandQueue(context, device, 0, &_err_));
size_t max_const_mem = 0;
OCL(clGetDeviceInfo(device, CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE,
sizeof(size_t), &max_const_mem, NULL));
printf("Const memory size: %zu\n", max_const_mem);
clfft_init();
return CL_SUCCESS;
}
int main()
{
OCL(init());
clfftLayout layout = CLFFT_COMPLEX_INTERLEAVED;
clfftPrecision pres = CLFFT_SINGLE;
clfftResultLocation loc = CLFFT_INPLACE;
clfftDim dim = CLFFT_2D;
clfftDirection dir = CLFFT_FORWARD;
cl_float scale = 1.0;
unsigned batch = 1;
cl_mem d_idata;
cl_mem d_odata;
for(int n = 256; n <= 2048; n*=2) {
printf("%4d, %4d\n", n, n);
size_t lengths[3] = {n, n, 1};
size_t strides[4] = {1, 1, 1, 1};
for (int i = 1; i <= 3; i++) strides[i] = strides[i - 1] * lengths[i - 1];
size_t numel = strides[3];
CHECK(d_odata = clCreateBuffer(context, CL_MEM_READ_WRITE,
n * n * sizeof(cl_float2),
NULL, &_err_));
CHECK(d_idata = clCreateBuffer(context, CL_MEM_READ_WRITE,
n * n * sizeof(cl_float2),
NULL, &_err_));
clfftPlanHandle plan;
CLFFT(clfftCreateDefaultPlan(&plan, context, dim, lengths));
CLFFT(clfftSetPlanPrecision(plan, pres));
CLFFT(clfftSetPlanScale(plan, dir, scale));
CLFFT(clfftSetPlanBatchSize(plan, batch));
CLFFT(clfftSetPlanDim(plan, dim));
CLFFT(clfftSetPlanLength(plan, dim, lengths));
CLFFT(clfftSetPlanInStride(plan, dim, strides));
CLFFT(clfftSetPlanOutStride(plan, dim, strides));
CLFFT(clfftSetPlanDistance(plan, numel, numel));
CLFFT(clfftSetLayout(plan, layout, layout));
CLFFT(clfftSetResultLocation(plan, loc));
CLFFT(clfftBakePlan(plan, 1, &queue, NULL, NULL));
OCL(clFinish(queue));
// Perform the transform
CLFFT(clfftEnqueueTransform(plan, dir, 1, &queue, 0, NULL, NULL,
&d_idata, NULL, NULL));
OCL(clFinish(queue));
CLFFT(clfftDestroyPlan(&plan));
OCL(clReleaseMemObject(d_idata));
OCL(clReleaseMemObject(d_odata));
}
return 0;
}
#define HSTRIDE 1024
#define VSTRIDE 1024
#define DIM 32
__attribute__((reqd_work_group_size(64,1,1)))
__kernel void
fft_trans(__global float2 * gcomplx)
{
__local float2 ldsa[1024];
__local float2 ldsb[1024];
uint gid = get_global_id(0);
uint me = gid & 0x3fU;
uint k = (gid >> 6) % 528;
// Compute location of blocks
int l = DIM+0.5f - native_sqrt((DIM+0.5f)*(DIM+0.5f) - 2.0f * (float)as_int(k));
int kl = ((DIM*2+1 - l) * l) >> 1;
uint j = k - kl;
uint i = l + j;
uint goa, gob;
uint go = ((me & 0x7U) << 2) + ((gid>>6)/528) * VSTRIDE * HSTRIDE;
__global float4 *gp;
__local float4 *lp4;
uint lo = ((me & 0x7U) << 7) + (me >> 3);
uint lot = (me<<2);
__local float2 *lp;
float4 z00, z01, z10, z11, z20, z21, z30, z31;
// Array offsets
goa = go + (i << 5) + j * (HSTRIDE*32) + (me >> 3)*HSTRIDE;
// Load A block
gp = (__global float4 *)(gcomplx + goa);
z00 = gp[0*HSTRIDE/4*16];
z01 = gp[0*HSTRIDE/4*16 + 1];
z10 = gp[1*HSTRIDE/4*16];
z11 = gp[1*HSTRIDE/4*16 + 1];
z20 = gp[2*HSTRIDE/4*16];
z21 = gp[2*HSTRIDE/4*16 + 1];
z30 = gp[3*HSTRIDE/4*16];
z31 = gp[3*HSTRIDE/4*16 + 1];
// Save into LDS
lp = ldsa + lo;
lp[0*8] = z00.xy;
lp[1*8] = z10.xy;
lp[2*8] = z20.xy;
lp[3*8] = z30.xy;
lp += 8*4;
lp[0*8] = z00.zw;
lp[1*8] = z10.zw;
lp[2*8] = z20.zw;
lp[3*8] = z30.zw;
lp += 8*4;
lp[0*8] = z01.xy;
lp[1*8] = z11.xy;
lp[2*8] = z21.xy;
lp[3*8] = z31.xy;
lp += 8*4;
lp[0*8] = z01.zw;
lp[1*8] = z11.zw;
lp[2*8] = z21.zw;
lp[3*8] = z31.zw;
lp += 8*4;
//End load A block
// Load B block
gob = go + (j << 5) + i * (HSTRIDE*32) + (me >> 3)*HSTRIDE;
gp = (__global float4 *)(gcomplx + gob);
z00 = gp[0*HSTRIDE/4*16];
z01 = gp[0*HSTRIDE/4*16 + 1];
z10 = gp[1*HSTRIDE/4*16];
z11 = gp[1*HSTRIDE/4*16 + 1];
z20 = gp[2*HSTRIDE/4*16];
z21 = gp[2*HSTRIDE/4*16 + 1];
z30 = gp[3*HSTRIDE/4*16];
z31 = gp[3*HSTRIDE/4*16 + 1];
// Save into LDS
lp = ldsb + lo;
lp[0*8] = z00.xy;
lp[1*8] = z10.xy;
lp[2*8] = z20.xy;
lp[3*8] = z30.xy;
lp += 8*4;
lp[0*8] = z00.zw;
lp[1*8] = z10.zw;
lp[2*8] = z20.zw;
lp[3*8] = z30.zw;
lp += 8*4;
lp[0*8] = z01.xy;
lp[1*8] = z11.xy;
lp[2*8] = z21.xy;
lp[3*8] = z31.xy;
lp += 8*4;
lp[0*8] = z01.zw;
lp[1*8] = z11.zw;
lp[2*8] = z21.zw;
lp[3*8] = z31.zw;
lp += 8*4;
// End load B block
barrier(CLK_LOCAL_MEM_FENCE);
// write A block
goa = go + (i << 5) + j * (VSTRIDE*32) + (me >> 3)*VSTRIDE;
gp = (__global float4 *)(gcomplx + goa);
lp4 = (__local float4 *)(ldsb + lot);
z00 = lp4[0];
z01 = lp4[1];
lp4 += 32*4;
z10 = lp4[0];
z11 = lp4[1];
lp4 += 32*4;
z20 = lp4[0];
z21 = lp4[1];
lp4 += 32*4;
z30 = lp4[0];
z31 = lp4[1];
gp[0*VSTRIDE/4*16] = z00;
gp[0*VSTRIDE/4*16+1] = z01;
gp[1*VSTRIDE/4*16] = z10;
gp[1*VSTRIDE/4*16+1] = z11;
gp[2*VSTRIDE/4*16] = z20;
gp[2*VSTRIDE/4*16+1] = z21;
gp[3*VSTRIDE/4*16] = z30;
gp[3*VSTRIDE/4*16+1] = z31;
// End write A block;
// write B block
gob = go + (j << 5) + i * (VSTRIDE*32) + (me >> 3)*VSTRIDE;
gp = (__global float4 *)(gcomplx + gob);
lp4 = (__local float4 *)(ldsa + lot);
z00 = lp4[0];
z01 = lp4[1];
lp4 += 32*4;
z10 = lp4[0];
z11 = lp4[1];
lp4 += 32*4;
z20 = lp4[0];
z21 = lp4[1];
lp4 += 32*4;
z30 = lp4[0];
z31 = lp4[1];
gp[0*VSTRIDE/4*16] = z00;
gp[0*VSTRIDE/4*16+1] = z01;
gp[1*VSTRIDE/4*16] = z10;
gp[1*VSTRIDE/4*16+1] = z11;
gp[2*VSTRIDE/4*16] = z20;
gp[2*VSTRIDE/4*16+1] = z21;
gp[3*VSTRIDE/4*16] = z30;
gp[3*VSTRIDE/4*16+1] = z31;
// End write B block;
}
#include <stdio.h>
#include <CL/cl.h>
#include <iostream>
#include <fstream>
#define ERR(str, status) do { \
printf("%s(%d):"str, \
__FILE__, __LINE__, status); \
return status; \
} while(0)
// for ocl calls that return errors
#define OCL(call) do { \
cl_int _err = (call); \
if ( _err != CL_SUCCESS) { \
ERR("OpenCL Error: %d\n", _err); \
} \
} while(0)
#define CHECK(call) do { \
cl_int _err_ = CL_SUCCESS; \
call; \
OCL(_err_); \
} while(0)
cl_context context = 0;
cl_device_id device = 0;
cl_platform_id platform = 0;
cl_command_queue queue = 0;
int init()
{
OCL(clGetPlatformIDs(1, &platform, NULL));
OCL(clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU,
1, &device, NULL));
// Create context
cl_context_properties props[] = {
CL_CONTEXT_PLATFORM, (cl_context_properties)platform,
0
};
CHECK(context = clCreateContext(props, 1, &device, NULL, NULL, &_err_));
CHECK(queue = clCreateCommandQueue(context, device, 0, &_err_));
size_t max_const_mem = 0;
OCL(clGetDeviceInfo(device, CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE,
sizeof(size_t), &max_const_mem, NULL));
printf("Const memory size: %zu\n", max_const_mem);
return CL_SUCCESS;
}
int read_kernel(char **buffer, const char *file_name)
{
std::ifstream ifs;
ifs.open(file_name, std::ifstream::in);
ifs.seekg(0, ifs.end);
int length = ifs.tellg();
ifs.seekg(0, ifs.beg);
*buffer = new char[length];
ifs.read(*buffer, length);
ifs.close();
return length;
}
int main()
{
OCL(init());
char *trans_str = NULL;
size_t length = (size_t)read_kernel(&trans_str, "trans.cl");
int n = 1024;
size_t sz = n * n * sizeof(cl_float2);
cl_program program;
cl_kernel kernel;
cl_mem d_data = 0;
CHECK(d_data = clCreateBuffer(context, CL_MEM_READ_WRITE, sz, NULL, &_err_););
CHECK(program = clCreateProgramWithSource(context, 1,
(const char **)&trans_str,
&length, &_err_); );
OCL(clBuildProgram(program, 0, NULL, "", NULL, NULL));
CHECK(kernel = clCreateKernel(program, "fft_trans", &_err_););
size_t local[] = {64, 1};
size_t global[] = {528 * local[0], 1 * local[1]};
OCL(clSetKernelArg(kernel, 0, sizeof(cl_mem), &d_data));
OCL(clEnqueueNDRangeKernel(queue, kernel, 1,
NULL, global, local, 0, NULL, NULL));
OCL(clFinish(queue));
if (d_data) OCL(clReleaseMemObject(d_data));
delete[] trans_str;
return 0;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment