Last active
December 25, 2015 19:59
-
-
Save pavanky/7031425 to your computer and use it in GitHub Desktop.
Test case for 2D fft
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <stdio.h> | |
#include <clFFT.h> | |
#define ERR(str, status) do { \ | |
printf("%s(%d):"str, \ | |
__FILE__, __LINE__, status); \ | |
return status; \ | |
} while(0) | |
#define CLFFT(fn) do { \ | |
clfftStatus _err = fn; \ | |
if (_err != CLFFT_SUCCESS) { \ | |
ERR("clFFT Error : %d\n", _err); \ | |
} \ | |
} while(0) | |
// for ocl calls that return errors | |
#define OCL(call) do { \ | |
cl_int _err = (call); \ | |
if ( _err != CL_SUCCESS) { \ | |
ERR("OpenCL Error: %d\n", _err); \ | |
} \ | |
} while(0) | |
#define CHECK(call) do { \ | |
cl_int _err_ = CL_SUCCESS; \ | |
call; \ | |
OCL(_err_); \ | |
} while(0) | |
cl_context context = 0; | |
cl_device_id device = 0; | |
cl_platform_id platform = 0; | |
cl_command_queue queue = 0; | |
class _clfft_init | |
{ | |
public: | |
_clfft_init() | |
{ | |
// Initialize flags for FFT library | |
clfftSetupData* setupData = new clfftSetupData(); | |
clfftInitSetupData(setupData); | |
clfftSetup(setupData); | |
} | |
~_clfft_init() | |
{ | |
clfftTeardown(); | |
} | |
}; | |
static void clfft_init() | |
{ | |
static _clfft_init _fft = _clfft_init(); | |
return; | |
} | |
int init() | |
{ | |
OCL(clGetPlatformIDs(1, &platform, NULL)); | |
OCL(clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, | |
1, &device, NULL)); | |
// Create context | |
cl_context_properties props[] = { | |
CL_CONTEXT_PLATFORM, (cl_context_properties)platform, | |
0 | |
}; | |
CHECK(context = clCreateContext(props, 1, &device, NULL, NULL, &_err_)); | |
CHECK(queue = clCreateCommandQueue(context, device, 0, &_err_)); | |
size_t max_const_mem = 0; | |
OCL(clGetDeviceInfo(device, CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE, | |
sizeof(size_t), &max_const_mem, NULL)); | |
printf("Const memory size: %zu\n", max_const_mem); | |
clfft_init(); | |
return CL_SUCCESS; | |
} | |
int main() | |
{ | |
OCL(init()); | |
clfftLayout layout = CLFFT_COMPLEX_INTERLEAVED; | |
clfftPrecision pres = CLFFT_SINGLE; | |
clfftResultLocation loc = CLFFT_INPLACE; | |
clfftDim dim = CLFFT_2D; | |
clfftDirection dir = CLFFT_FORWARD; | |
cl_float scale = 1.0; | |
unsigned batch = 1; | |
cl_mem d_idata; | |
cl_mem d_odata; | |
for(int n = 256; n <= 2048; n*=2) { | |
printf("%4d, %4d\n", n, n); | |
size_t lengths[3] = {n, n, 1}; | |
size_t strides[4] = {1, 1, 1, 1}; | |
for (int i = 1; i <= 3; i++) strides[i] = strides[i - 1] * lengths[i - 1]; | |
size_t numel = strides[3]; | |
CHECK(d_odata = clCreateBuffer(context, CL_MEM_READ_WRITE, | |
n * n * sizeof(cl_float2), | |
NULL, &_err_)); | |
CHECK(d_idata = clCreateBuffer(context, CL_MEM_READ_WRITE, | |
n * n * sizeof(cl_float2), | |
NULL, &_err_)); | |
clfftPlanHandle plan; | |
CLFFT(clfftCreateDefaultPlan(&plan, context, dim, lengths)); | |
CLFFT(clfftSetPlanPrecision(plan, pres)); | |
CLFFT(clfftSetPlanScale(plan, dir, scale)); | |
CLFFT(clfftSetPlanBatchSize(plan, batch)); | |
CLFFT(clfftSetPlanDim(plan, dim)); | |
CLFFT(clfftSetPlanLength(plan, dim, lengths)); | |
CLFFT(clfftSetPlanInStride(plan, dim, strides)); | |
CLFFT(clfftSetPlanOutStride(plan, dim, strides)); | |
CLFFT(clfftSetPlanDistance(plan, numel, numel)); | |
CLFFT(clfftSetLayout(plan, layout, layout)); | |
CLFFT(clfftSetResultLocation(plan, loc)); | |
CLFFT(clfftBakePlan(plan, 1, &queue, NULL, NULL)); | |
OCL(clFinish(queue)); | |
// Perform the transform | |
CLFFT(clfftEnqueueTransform(plan, dir, 1, &queue, 0, NULL, NULL, | |
&d_idata, NULL, NULL)); | |
OCL(clFinish(queue)); | |
CLFFT(clfftDestroyPlan(&plan)); | |
OCL(clReleaseMemObject(d_idata)); | |
OCL(clReleaseMemObject(d_odata)); | |
} | |
return 0; | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#define HSTRIDE 1024 | |
#define VSTRIDE 1024 | |
#define DIM 32 | |
__attribute__((reqd_work_group_size(64,1,1))) | |
__kernel void | |
fft_trans(__global float2 * gcomplx) | |
{ | |
__local float2 ldsa[1024]; | |
__local float2 ldsb[1024]; | |
uint gid = get_global_id(0); | |
uint me = gid & 0x3fU; | |
uint k = (gid >> 6) % 528; | |
// Compute location of blocks | |
int l = DIM+0.5f - native_sqrt((DIM+0.5f)*(DIM+0.5f) - 2.0f * (float)as_int(k)); | |
int kl = ((DIM*2+1 - l) * l) >> 1; | |
uint j = k - kl; | |
uint i = l + j; | |
uint goa, gob; | |
uint go = ((me & 0x7U) << 2) + ((gid>>6)/528) * VSTRIDE * HSTRIDE; | |
__global float4 *gp; | |
__local float4 *lp4; | |
uint lo = ((me & 0x7U) << 7) + (me >> 3); | |
uint lot = (me<<2); | |
__local float2 *lp; | |
float4 z00, z01, z10, z11, z20, z21, z30, z31; | |
// Array offsets | |
goa = go + (i << 5) + j * (HSTRIDE*32) + (me >> 3)*HSTRIDE; | |
// Load A block | |
gp = (__global float4 *)(gcomplx + goa); | |
z00 = gp[0*HSTRIDE/4*16]; | |
z01 = gp[0*HSTRIDE/4*16 + 1]; | |
z10 = gp[1*HSTRIDE/4*16]; | |
z11 = gp[1*HSTRIDE/4*16 + 1]; | |
z20 = gp[2*HSTRIDE/4*16]; | |
z21 = gp[2*HSTRIDE/4*16 + 1]; | |
z30 = gp[3*HSTRIDE/4*16]; | |
z31 = gp[3*HSTRIDE/4*16 + 1]; | |
// Save into LDS | |
lp = ldsa + lo; | |
lp[0*8] = z00.xy; | |
lp[1*8] = z10.xy; | |
lp[2*8] = z20.xy; | |
lp[3*8] = z30.xy; | |
lp += 8*4; | |
lp[0*8] = z00.zw; | |
lp[1*8] = z10.zw; | |
lp[2*8] = z20.zw; | |
lp[3*8] = z30.zw; | |
lp += 8*4; | |
lp[0*8] = z01.xy; | |
lp[1*8] = z11.xy; | |
lp[2*8] = z21.xy; | |
lp[3*8] = z31.xy; | |
lp += 8*4; | |
lp[0*8] = z01.zw; | |
lp[1*8] = z11.zw; | |
lp[2*8] = z21.zw; | |
lp[3*8] = z31.zw; | |
lp += 8*4; | |
//End load A block | |
// Load B block | |
gob = go + (j << 5) + i * (HSTRIDE*32) + (me >> 3)*HSTRIDE; | |
gp = (__global float4 *)(gcomplx + gob); | |
z00 = gp[0*HSTRIDE/4*16]; | |
z01 = gp[0*HSTRIDE/4*16 + 1]; | |
z10 = gp[1*HSTRIDE/4*16]; | |
z11 = gp[1*HSTRIDE/4*16 + 1]; | |
z20 = gp[2*HSTRIDE/4*16]; | |
z21 = gp[2*HSTRIDE/4*16 + 1]; | |
z30 = gp[3*HSTRIDE/4*16]; | |
z31 = gp[3*HSTRIDE/4*16 + 1]; | |
// Save into LDS | |
lp = ldsb + lo; | |
lp[0*8] = z00.xy; | |
lp[1*8] = z10.xy; | |
lp[2*8] = z20.xy; | |
lp[3*8] = z30.xy; | |
lp += 8*4; | |
lp[0*8] = z00.zw; | |
lp[1*8] = z10.zw; | |
lp[2*8] = z20.zw; | |
lp[3*8] = z30.zw; | |
lp += 8*4; | |
lp[0*8] = z01.xy; | |
lp[1*8] = z11.xy; | |
lp[2*8] = z21.xy; | |
lp[3*8] = z31.xy; | |
lp += 8*4; | |
lp[0*8] = z01.zw; | |
lp[1*8] = z11.zw; | |
lp[2*8] = z21.zw; | |
lp[3*8] = z31.zw; | |
lp += 8*4; | |
// End load B block | |
barrier(CLK_LOCAL_MEM_FENCE); | |
// write A block | |
goa = go + (i << 5) + j * (VSTRIDE*32) + (me >> 3)*VSTRIDE; | |
gp = (__global float4 *)(gcomplx + goa); | |
lp4 = (__local float4 *)(ldsb + lot); | |
z00 = lp4[0]; | |
z01 = lp4[1]; | |
lp4 += 32*4; | |
z10 = lp4[0]; | |
z11 = lp4[1]; | |
lp4 += 32*4; | |
z20 = lp4[0]; | |
z21 = lp4[1]; | |
lp4 += 32*4; | |
z30 = lp4[0]; | |
z31 = lp4[1]; | |
gp[0*VSTRIDE/4*16] = z00; | |
gp[0*VSTRIDE/4*16+1] = z01; | |
gp[1*VSTRIDE/4*16] = z10; | |
gp[1*VSTRIDE/4*16+1] = z11; | |
gp[2*VSTRIDE/4*16] = z20; | |
gp[2*VSTRIDE/4*16+1] = z21; | |
gp[3*VSTRIDE/4*16] = z30; | |
gp[3*VSTRIDE/4*16+1] = z31; | |
// End write A block; | |
// write B block | |
gob = go + (j << 5) + i * (VSTRIDE*32) + (me >> 3)*VSTRIDE; | |
gp = (__global float4 *)(gcomplx + gob); | |
lp4 = (__local float4 *)(ldsa + lot); | |
z00 = lp4[0]; | |
z01 = lp4[1]; | |
lp4 += 32*4; | |
z10 = lp4[0]; | |
z11 = lp4[1]; | |
lp4 += 32*4; | |
z20 = lp4[0]; | |
z21 = lp4[1]; | |
lp4 += 32*4; | |
z30 = lp4[0]; | |
z31 = lp4[1]; | |
gp[0*VSTRIDE/4*16] = z00; | |
gp[0*VSTRIDE/4*16+1] = z01; | |
gp[1*VSTRIDE/4*16] = z10; | |
gp[1*VSTRIDE/4*16+1] = z11; | |
gp[2*VSTRIDE/4*16] = z20; | |
gp[2*VSTRIDE/4*16+1] = z21; | |
gp[3*VSTRIDE/4*16] = z30; | |
gp[3*VSTRIDE/4*16+1] = z31; | |
// End write B block; | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <stdio.h> | |
#include <CL/cl.h> | |
#include <iostream> | |
#include <fstream> | |
#define ERR(str, status) do { \ | |
printf("%s(%d):"str, \ | |
__FILE__, __LINE__, status); \ | |
return status; \ | |
} while(0) | |
// for ocl calls that return errors | |
#define OCL(call) do { \ | |
cl_int _err = (call); \ | |
if ( _err != CL_SUCCESS) { \ | |
ERR("OpenCL Error: %d\n", _err); \ | |
} \ | |
} while(0) | |
#define CHECK(call) do { \ | |
cl_int _err_ = CL_SUCCESS; \ | |
call; \ | |
OCL(_err_); \ | |
} while(0) | |
cl_context context = 0; | |
cl_device_id device = 0; | |
cl_platform_id platform = 0; | |
cl_command_queue queue = 0; | |
int init() | |
{ | |
OCL(clGetPlatformIDs(1, &platform, NULL)); | |
OCL(clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, | |
1, &device, NULL)); | |
// Create context | |
cl_context_properties props[] = { | |
CL_CONTEXT_PLATFORM, (cl_context_properties)platform, | |
0 | |
}; | |
CHECK(context = clCreateContext(props, 1, &device, NULL, NULL, &_err_)); | |
CHECK(queue = clCreateCommandQueue(context, device, 0, &_err_)); | |
size_t max_const_mem = 0; | |
OCL(clGetDeviceInfo(device, CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE, | |
sizeof(size_t), &max_const_mem, NULL)); | |
printf("Const memory size: %zu\n", max_const_mem); | |
return CL_SUCCESS; | |
} | |
int read_kernel(char **buffer, const char *file_name) | |
{ | |
std::ifstream ifs; | |
ifs.open(file_name, std::ifstream::in); | |
ifs.seekg(0, ifs.end); | |
int length = ifs.tellg(); | |
ifs.seekg(0, ifs.beg); | |
*buffer = new char[length]; | |
ifs.read(*buffer, length); | |
ifs.close(); | |
return length; | |
} | |
int main() | |
{ | |
OCL(init()); | |
char *trans_str = NULL; | |
size_t length = (size_t)read_kernel(&trans_str, "trans.cl"); | |
int n = 1024; | |
size_t sz = n * n * sizeof(cl_float2); | |
cl_program program; | |
cl_kernel kernel; | |
cl_mem d_data = 0; | |
CHECK(d_data = clCreateBuffer(context, CL_MEM_READ_WRITE, sz, NULL, &_err_);); | |
CHECK(program = clCreateProgramWithSource(context, 1, | |
(const char **)&trans_str, | |
&length, &_err_); ); | |
OCL(clBuildProgram(program, 0, NULL, "", NULL, NULL)); | |
CHECK(kernel = clCreateKernel(program, "fft_trans", &_err_);); | |
size_t local[] = {64, 1}; | |
size_t global[] = {528 * local[0], 1 * local[1]}; | |
OCL(clSetKernelArg(kernel, 0, sizeof(cl_mem), &d_data)); | |
OCL(clEnqueueNDRangeKernel(queue, kernel, 1, | |
NULL, global, local, 0, NULL, NULL)); | |
OCL(clFinish(queue)); | |
if (d_data) OCL(clReleaseMemObject(d_data)); | |
delete[] trans_str; | |
return 0; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment