Created
September 5, 2017 13:40
-
-
Save otherjason/58ef9d5d93304356c8f1d2633160d32b to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <cufftXt.h> | |
#include <iostream> | |
#include <stdint.h> | |
#include <stdio.h> | |
#include <thrust/complex.h> | |
#include <vector> | |
void check_error(cudaError_t e, const char *file=NULL, int line=0) | |
{ | |
if (e != cudaSuccess) { | |
std::cerr << "CUDA API error"; | |
if (file) std::cerr << " at " << file << ':' << line; | |
std::cerr << ": " << cudaGetErrorString(e) << " (" | |
<< cudaGetErrorName(e) << ')' << std::endl; | |
exit(1); | |
} | |
} | |
void check_error(cufftResult e, const char *file=NULL, int line=0) | |
{ | |
if (e==CUFFT_SUCCESS) | |
return; | |
const char * em=NULL; | |
switch (e) | |
{ | |
case CUFFT_INVALID_PLAN:em="CUFFT_INVALID_PLAN";break; | |
case CUFFT_ALLOC_FAILED:em="CUFFT_ALLOC_FAILED";break; | |
case CUFFT_INVALID_TYPE:em="CUFFT_INVALID_TYPE";break; | |
case CUFFT_INVALID_VALUE:em="CUFFT_INVALID_VALUE";break; | |
case CUFFT_INTERNAL_ERROR:em="CUFFT_INTERNAL_ERROR";break; | |
case CUFFT_EXEC_FAILED:em="CUFFT_EXEC_FAILED";break; | |
case CUFFT_SETUP_FAILED:em="CUFFT_SETUP_FAILED";break; | |
case CUFFT_INVALID_SIZE:em="CUFFT_INVALID_SIZE";break; | |
case CUFFT_UNALIGNED_DATA:em="CUFFT_UNALIGNED_DATA";break; | |
case CUFFT_INVALID_DEVICE:em="CUFFT_INVALID_DEVICE";break; | |
case CUFFT_NO_WORKSPACE:em="CUFFT_NO_WORKSPACE";break; | |
case CUFFT_NOT_IMPLEMENTED:em="CUFFT_NOT_IMPLEMENTED";break; | |
case CUFFT_LICENSE_ERROR:em="CUFFT_LICENSE_ERROR";break; | |
default:em="";break; | |
} | |
std::cerr << "CUFFT API error (" << e << ")" << em; | |
if (file) std::cerr << " at " << file << ':' << line; | |
std::cerr << std::endl; | |
exit(1); | |
} | |
#define CUDA_CHECK_ERROR(e) check_error(e,__FILE__,__LINE__) | |
// Simple callback function that treats the input buffer as 16-bit integers and converts to cufftReal. | |
__device__ cufftReal callback(void * inbuf, size_t fft_index, void *, void *) | |
{ | |
return reinterpret_cast<short *>(inbuf)[fft_index]; | |
} | |
// Callback function pointers on the device. | |
__device__ cufftCallbackLoadR d_callback = callback; | |
void do_transform(cufftReal *dev_in, thrust::complex<float> *dev_out, int nfft, int elem_pitch, int batch, bool use_callback) | |
{ | |
std::cout << "doing transform; callbacks: " << use_callback << "..."; | |
// Create the FFT plan. | |
cufftHandle plan; | |
CUDA_CHECK_ERROR(cufftPlanMany(&plan, 1, &nfft, &nfft, 1, nfft, &nfft, 1, elem_pitch, CUFFT_R2C, batch)); | |
// Attach the callback if needed. | |
if (use_callback) | |
{ | |
cufftCallbackLoadR load_callback_ptr; | |
CUDA_CHECK_ERROR(cudaMemcpyFromSymbol(&load_callback_ptr, d_callback, sizeof(load_callback_ptr))); | |
CUDA_CHECK_ERROR(cufftXtSetCallback(plan, (void **) &load_callback_ptr, CUFFT_CB_LD_REAL, NULL)); | |
} | |
// Create a stream to run the transforms on. | |
cudaStream_t stream; | |
cudaStreamCreate(&stream); | |
cufftSetStream(plan, stream); | |
// Create events to measure the time taken for the transform. | |
cudaEvent_t start, stop; | |
CUDA_CHECK_ERROR(cudaEventCreate(&start)); | |
CUDA_CHECK_ERROR(cudaEventCreate(&stop)); | |
// Execute the plan. | |
CUDA_CHECK_ERROR(cudaEventRecord(start, stream)); | |
for (int i = 0; i < 32; ++i) | |
{ | |
CUDA_CHECK_ERROR(cufftExecR2C(plan, (cufftReal *) dev_in, (cufftComplex *) dev_out)); | |
} | |
CUDA_CHECK_ERROR(cudaEventRecord(stop, stream)); | |
CUDA_CHECK_ERROR(cudaStreamSynchronize(stream)); | |
// Clean up the plan. | |
CUDA_CHECK_ERROR(cufftDestroy(plan)); | |
float time = 0; | |
CUDA_CHECK_ERROR(cudaEventElapsedTime(&time, start, stop)); | |
std::cout << "done; time: " << time << " msec" << std::endl; | |
} | |
void do_test(int nfft, int batch) | |
{ | |
// Allocate input/output buffers on the device. | |
cufftReal *dev_in; | |
thrust::complex<float> *dev_out1, *dev_out2; | |
size_t in_buf_elem = nfft * batch; | |
size_t in_buf_bytes = in_buf_elem * sizeof(cufftReal); | |
CUDA_CHECK_ERROR(cudaMalloc(&dev_in, in_buf_bytes)); | |
size_t out_row_len = nfft / 2 + 1; // number of elements per output transform | |
size_t byte_pitch; | |
CUDA_CHECK_ERROR(cudaMallocPitch(&dev_out1, &byte_pitch, out_row_len * sizeof(thrust::complex<float>), batch)); | |
CUDA_CHECK_ERROR(cudaMallocPitch(&dev_out2, &byte_pitch, out_row_len * sizeof(thrust::complex<float>), batch)); | |
size_t elem_pitch = byte_pitch / sizeof(thrust::complex<float>); | |
// Fill the input buffer with zeros to ensure that it represents valid integer and floating-point values at the same time. | |
CUDA_CHECK_ERROR(cudaMemset(dev_in, 0, in_buf_bytes)); | |
// Execute the transform with and without callbacks. | |
do_transform(dev_in, dev_out1, nfft, elem_pitch, batch, false); | |
do_transform(dev_in, dev_out2, nfft, elem_pitch, batch, true); | |
} | |
int main() | |
{ | |
do_test(32768, 32); | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment