Skip to content

Instantly share code, notes, and snippets.

@otherjason
Created September 5, 2017 13:40
Show Gist options
  • Save otherjason/58ef9d5d93304356c8f1d2633160d32b to your computer and use it in GitHub Desktop.
Save otherjason/58ef9d5d93304356c8f1d2633160d32b to your computer and use it in GitHub Desktop.
#include <cufftXt.h>
#include <iostream>
#include <stdint.h>
#include <stdio.h>
#include <thrust/complex.h>
#include <vector>
void check_error(cudaError_t e, const char *file=NULL, int line=0)
{
if (e != cudaSuccess) {
std::cerr << "CUDA API error";
if (file) std::cerr << " at " << file << ':' << line;
std::cerr << ": " << cudaGetErrorString(e) << " ("
<< cudaGetErrorName(e) << ')' << std::endl;
exit(1);
}
}
void check_error(cufftResult e, const char *file=NULL, int line=0)
{
if (e==CUFFT_SUCCESS)
return;
const char * em=NULL;
switch (e)
{
case CUFFT_INVALID_PLAN:em="CUFFT_INVALID_PLAN";break;
case CUFFT_ALLOC_FAILED:em="CUFFT_ALLOC_FAILED";break;
case CUFFT_INVALID_TYPE:em="CUFFT_INVALID_TYPE";break;
case CUFFT_INVALID_VALUE:em="CUFFT_INVALID_VALUE";break;
case CUFFT_INTERNAL_ERROR:em="CUFFT_INTERNAL_ERROR";break;
case CUFFT_EXEC_FAILED:em="CUFFT_EXEC_FAILED";break;
case CUFFT_SETUP_FAILED:em="CUFFT_SETUP_FAILED";break;
case CUFFT_INVALID_SIZE:em="CUFFT_INVALID_SIZE";break;
case CUFFT_UNALIGNED_DATA:em="CUFFT_UNALIGNED_DATA";break;
case CUFFT_INVALID_DEVICE:em="CUFFT_INVALID_DEVICE";break;
case CUFFT_NO_WORKSPACE:em="CUFFT_NO_WORKSPACE";break;
case CUFFT_NOT_IMPLEMENTED:em="CUFFT_NOT_IMPLEMENTED";break;
case CUFFT_LICENSE_ERROR:em="CUFFT_LICENSE_ERROR";break;
default:em="";break;
}
std::cerr << "CUFFT API error (" << e << ")" << em;
if (file) std::cerr << " at " << file << ':' << line;
std::cerr << std::endl;
exit(1);
}
#define CUDA_CHECK_ERROR(e) check_error(e,__FILE__,__LINE__)
// Simple callback function that treats the input buffer as 16-bit integers and converts to cufftReal.
__device__ cufftReal callback(void * inbuf, size_t fft_index, void *, void *)
{
return reinterpret_cast<short *>(inbuf)[fft_index];
}
// Callback function pointers on the device.
__device__ cufftCallbackLoadR d_callback = callback;
void do_transform(cufftReal *dev_in, thrust::complex<float> *dev_out, int nfft, int elem_pitch, int batch, bool use_callback)
{
std::cout << "doing transform; callbacks: " << use_callback << "...";
// Create the FFT plan.
cufftHandle plan;
CUDA_CHECK_ERROR(cufftPlanMany(&plan, 1, &nfft, &nfft, 1, nfft, &nfft, 1, elem_pitch, CUFFT_R2C, batch));
// Attach the callback if needed.
if (use_callback)
{
cufftCallbackLoadR load_callback_ptr;
CUDA_CHECK_ERROR(cudaMemcpyFromSymbol(&load_callback_ptr, d_callback, sizeof(load_callback_ptr)));
CUDA_CHECK_ERROR(cufftXtSetCallback(plan, (void **) &load_callback_ptr, CUFFT_CB_LD_REAL, NULL));
}
// Create a stream to run the transforms on.
cudaStream_t stream;
cudaStreamCreate(&stream);
cufftSetStream(plan, stream);
// Create events to measure the time taken for the transform.
cudaEvent_t start, stop;
CUDA_CHECK_ERROR(cudaEventCreate(&start));
CUDA_CHECK_ERROR(cudaEventCreate(&stop));
// Execute the plan.
CUDA_CHECK_ERROR(cudaEventRecord(start, stream));
for (int i = 0; i < 32; ++i)
{
CUDA_CHECK_ERROR(cufftExecR2C(plan, (cufftReal *) dev_in, (cufftComplex *) dev_out));
}
CUDA_CHECK_ERROR(cudaEventRecord(stop, stream));
CUDA_CHECK_ERROR(cudaStreamSynchronize(stream));
// Clean up the plan.
CUDA_CHECK_ERROR(cufftDestroy(plan));
float time = 0;
CUDA_CHECK_ERROR(cudaEventElapsedTime(&time, start, stop));
std::cout << "done; time: " << time << " msec" << std::endl;
}
void do_test(int nfft, int batch)
{
// Allocate input/output buffers on the device.
cufftReal *dev_in;
thrust::complex<float> *dev_out1, *dev_out2;
size_t in_buf_elem = nfft * batch;
size_t in_buf_bytes = in_buf_elem * sizeof(cufftReal);
CUDA_CHECK_ERROR(cudaMalloc(&dev_in, in_buf_bytes));
size_t out_row_len = nfft / 2 + 1; // number of elements per output transform
size_t byte_pitch;
CUDA_CHECK_ERROR(cudaMallocPitch(&dev_out1, &byte_pitch, out_row_len * sizeof(thrust::complex<float>), batch));
CUDA_CHECK_ERROR(cudaMallocPitch(&dev_out2, &byte_pitch, out_row_len * sizeof(thrust::complex<float>), batch));
size_t elem_pitch = byte_pitch / sizeof(thrust::complex<float>);
// Fill the input buffer with zeros to ensure that it represents valid integer and floating-point values at the same time.
CUDA_CHECK_ERROR(cudaMemset(dev_in, 0, in_buf_bytes));
// Execute the transform with and without callbacks.
do_transform(dev_in, dev_out1, nfft, elem_pitch, batch, false);
do_transform(dev_in, dev_out2, nfft, elem_pitch, batch, true);
}
int main()
{
do_test(32768, 32);
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment