otherjason/cufft_9_regression.cu

## cufft_9_regression.cu
#include <cufftXt.h>
#include <iostream>
#include <stdint.h>
#include <stdio.h>
#include <thrust/complex.h>
#include <vector>

void check_error(cudaError_t e, const char *file=NULL, int line=0)
{
    if (e != cudaSuccess) {
        std::cerr << "CUDA API error";
        if (file) std::cerr << " at " << file << ':' << line;
        std::cerr << ": " << cudaGetErrorString(e) << " ("
            << cudaGetErrorName(e) << ')' << std::endl;
        exit(1);
    }
}

void check_error(cufftResult e, const char *file=NULL, int line=0)
{
    if (e==CUFFT_SUCCESS)
        return;
    const char * em=NULL;
    switch (e)
    {
        case CUFFT_INVALID_PLAN:em="CUFFT_INVALID_PLAN";break;
        case CUFFT_ALLOC_FAILED:em="CUFFT_ALLOC_FAILED";break;
        case CUFFT_INVALID_TYPE:em="CUFFT_INVALID_TYPE";break;
        case CUFFT_INVALID_VALUE:em="CUFFT_INVALID_VALUE";break;
        case CUFFT_INTERNAL_ERROR:em="CUFFT_INTERNAL_ERROR";break;
        case CUFFT_EXEC_FAILED:em="CUFFT_EXEC_FAILED";break;
        case CUFFT_SETUP_FAILED:em="CUFFT_SETUP_FAILED";break;
        case CUFFT_INVALID_SIZE:em="CUFFT_INVALID_SIZE";break;
        case CUFFT_UNALIGNED_DATA:em="CUFFT_UNALIGNED_DATA";break;
        case CUFFT_INVALID_DEVICE:em="CUFFT_INVALID_DEVICE";break;
        case CUFFT_NO_WORKSPACE:em="CUFFT_NO_WORKSPACE";break;
        case CUFFT_NOT_IMPLEMENTED:em="CUFFT_NOT_IMPLEMENTED";break;
        case CUFFT_LICENSE_ERROR:em="CUFFT_LICENSE_ERROR";break;
        default:em="";break;
    }
    std::cerr << "CUFFT API error (" << e << ")" << em;
    if (file) std::cerr << " at " << file << ':' << line;
    std::cerr << std::endl;
    exit(1);
}

#define CUDA_CHECK_ERROR(e) check_error(e,__FILE__,__LINE__)

// Simple callback function that treats the input buffer as 16-bit integers and converts to cufftReal.
__device__ cufftReal callback(void * inbuf, size_t fft_index, void *, void *)
{
    return reinterpret_cast<short *>(inbuf)[fft_index];
}

// Callback function pointers on the device.
__device__ cufftCallbackLoadR d_callback = callback;

void do_transform(cufftReal *dev_in, thrust::complex<float> *dev_out, int nfft, int elem_pitch, int batch, bool use_callback)
{
    std::cout << "doing transform; callbacks: " << use_callback << "...";
    // Create the FFT plan.
    cufftHandle plan;
    CUDA_CHECK_ERROR(cufftPlanMany(&plan, 1, &nfft, &nfft, 1, nfft, &nfft, 1, elem_pitch, CUFFT_R2C, batch));

    // Attach the callback if needed.
    if (use_callback)
    {
        cufftCallbackLoadR load_callback_ptr;
        CUDA_CHECK_ERROR(cudaMemcpyFromSymbol(&load_callback_ptr, d_callback, sizeof(load_callback_ptr)));
        CUDA_CHECK_ERROR(cufftXtSetCallback(plan, (void **) &load_callback_ptr, CUFFT_CB_LD_REAL, NULL));
    }

    // Create a stream to run the transforms on.
    cudaStream_t stream;
    cudaStreamCreate(&stream);
    cufftSetStream(plan, stream);

    // Create events to measure the time taken for the transform.
    cudaEvent_t start, stop;
    CUDA_CHECK_ERROR(cudaEventCreate(&start));
    CUDA_CHECK_ERROR(cudaEventCreate(&stop));

    // Execute the plan.
    CUDA_CHECK_ERROR(cudaEventRecord(start, stream));
    for (int i = 0; i < 32; ++i)
    {
        CUDA_CHECK_ERROR(cufftExecR2C(plan, (cufftReal *) dev_in, (cufftComplex *) dev_out));
    }
    CUDA_CHECK_ERROR(cudaEventRecord(stop, stream));
    CUDA_CHECK_ERROR(cudaStreamSynchronize(stream));

    // Clean up the plan.
    CUDA_CHECK_ERROR(cufftDestroy(plan));
    float time = 0;
    CUDA_CHECK_ERROR(cudaEventElapsedTime(&time, start, stop));
    std::cout << "done; time: " << time << " msec" << std::endl;
}

void do_test(int nfft, int batch)
{
    // Allocate input/output buffers on the device.
    cufftReal *dev_in;
    thrust::complex<float> *dev_out1, *dev_out2;
    size_t in_buf_elem = nfft * batch;
    size_t in_buf_bytes = in_buf_elem * sizeof(cufftReal);
    CUDA_CHECK_ERROR(cudaMalloc(&dev_in, in_buf_bytes));

    size_t out_row_len = nfft / 2 + 1;  // number of elements per output transform
    size_t byte_pitch;
    CUDA_CHECK_ERROR(cudaMallocPitch(&dev_out1, &byte_pitch, out_row_len * sizeof(thrust::complex<float>), batch));
    CUDA_CHECK_ERROR(cudaMallocPitch(&dev_out2, &byte_pitch, out_row_len * sizeof(thrust::complex<float>), batch));
    size_t elem_pitch = byte_pitch / sizeof(thrust::complex<float>);

    // Fill the input buffer with zeros to ensure that it represents valid integer and floating-point values at the same time.
    CUDA_CHECK_ERROR(cudaMemset(dev_in, 0, in_buf_bytes));

    // Execute the transform with and without callbacks.
    do_transform(dev_in, dev_out1, nfft, elem_pitch, batch, false);
    do_transform(dev_in, dev_out2, nfft, elem_pitch, batch, true);
}

int main()
{
    do_test(32768, 32);
}
	#include <cufftXt.h>
	#include <iostream>
	#include <stdint.h>
	#include <stdio.h>
	#include <thrust/complex.h>
	#include <vector>

	void check_error(cudaError_t e, const char *file=NULL, int line=0)
	{
	if (e != cudaSuccess) {
	std::cerr << "CUDA API error";
	if (file) std::cerr << " at " << file << ':' << line;
	std::cerr << ": " << cudaGetErrorString(e) << " ("
	<< cudaGetErrorName(e) << ')' << std::endl;
	exit(1);
	}
	}

	void check_error(cufftResult e, const char *file=NULL, int line=0)
	{
	if (e==CUFFT_SUCCESS)
	return;
	const char * em=NULL;
	switch (e)
	{
	case CUFFT_INVALID_PLAN:em="CUFFT_INVALID_PLAN";break;
	case CUFFT_ALLOC_FAILED:em="CUFFT_ALLOC_FAILED";break;
	case CUFFT_INVALID_TYPE:em="CUFFT_INVALID_TYPE";break;
	case CUFFT_INVALID_VALUE:em="CUFFT_INVALID_VALUE";break;
	case CUFFT_INTERNAL_ERROR:em="CUFFT_INTERNAL_ERROR";break;
	case CUFFT_EXEC_FAILED:em="CUFFT_EXEC_FAILED";break;
	case CUFFT_SETUP_FAILED:em="CUFFT_SETUP_FAILED";break;
	case CUFFT_INVALID_SIZE:em="CUFFT_INVALID_SIZE";break;
	case CUFFT_UNALIGNED_DATA:em="CUFFT_UNALIGNED_DATA";break;
	case CUFFT_INVALID_DEVICE:em="CUFFT_INVALID_DEVICE";break;
	case CUFFT_NO_WORKSPACE:em="CUFFT_NO_WORKSPACE";break;
	case CUFFT_NOT_IMPLEMENTED:em="CUFFT_NOT_IMPLEMENTED";break;
	case CUFFT_LICENSE_ERROR:em="CUFFT_LICENSE_ERROR";break;
	default:em="";break;
	}
	std::cerr << "CUFFT API error (" << e << ")" << em;
	if (file) std::cerr << " at " << file << ':' << line;
	std::cerr << std::endl;
	exit(1);
	}

	#define CUDA_CHECK_ERROR(e) check_error(e,__FILE__,__LINE__)

	// Simple callback function that treats the input buffer as 16-bit integers and converts to cufftReal.
	__device__ cufftReal callback(void * inbuf, size_t fft_index, void , void )
	{
	return reinterpret_cast<short *>(inbuf)[fft_index];
	}

	// Callback function pointers on the device.
	__device__ cufftCallbackLoadR d_callback = callback;

	void do_transform(cufftReal dev_in, thrust::complex<float> dev_out, int nfft, int elem_pitch, int batch, bool use_callback)
	{
	std::cout << "doing transform; callbacks: " << use_callback << "...";
	// Create the FFT plan.
	cufftHandle plan;
	CUDA_CHECK_ERROR(cufftPlanMany(&plan, 1, &nfft, &nfft, 1, nfft, &nfft, 1, elem_pitch, CUFFT_R2C, batch));

	// Attach the callback if needed.
	if (use_callback)
	{
	cufftCallbackLoadR load_callback_ptr;
	CUDA_CHECK_ERROR(cudaMemcpyFromSymbol(&load_callback_ptr, d_callback, sizeof(load_callback_ptr)));
	CUDA_CHECK_ERROR(cufftXtSetCallback(plan, (void **) &load_callback_ptr, CUFFT_CB_LD_REAL, NULL));
	}

	// Create a stream to run the transforms on.
	cudaStream_t stream;
	cudaStreamCreate(&stream);
	cufftSetStream(plan, stream);

	// Create events to measure the time taken for the transform.
	cudaEvent_t start, stop;
	CUDA_CHECK_ERROR(cudaEventCreate(&start));
	CUDA_CHECK_ERROR(cudaEventCreate(&stop));

	// Execute the plan.
	CUDA_CHECK_ERROR(cudaEventRecord(start, stream));
	for (int i = 0; i < 32; ++i)
	{
	CUDA_CHECK_ERROR(cufftExecR2C(plan, (cufftReal ) dev_in, (cufftComplex ) dev_out));
	}
	CUDA_CHECK_ERROR(cudaEventRecord(stop, stream));
	CUDA_CHECK_ERROR(cudaStreamSynchronize(stream));

	// Clean up the plan.
	CUDA_CHECK_ERROR(cufftDestroy(plan));
	float time = 0;
	CUDA_CHECK_ERROR(cudaEventElapsedTime(&time, start, stop));
	std::cout << "done; time: " << time << " msec" << std::endl;
	}

	void do_test(int nfft, int batch)
	{
	// Allocate input/output buffers on the device.
	cufftReal *dev_in;
	thrust::complex<float> dev_out1, dev_out2;
	size_t in_buf_elem = nfft * batch;
	size_t in_buf_bytes = in_buf_elem * sizeof(cufftReal);
	CUDA_CHECK_ERROR(cudaMalloc(&dev_in, in_buf_bytes));

	size_t out_row_len = nfft / 2 + 1; // number of elements per output transform
	size_t byte_pitch;
	CUDA_CHECK_ERROR(cudaMallocPitch(&dev_out1, &byte_pitch, out_row_len * sizeof(thrust::complex<float>), batch));
	CUDA_CHECK_ERROR(cudaMallocPitch(&dev_out2, &byte_pitch, out_row_len * sizeof(thrust::complex<float>), batch));
	size_t elem_pitch = byte_pitch / sizeof(thrust::complex<float>);

	// Fill the input buffer with zeros to ensure that it represents valid integer and floating-point values at the same time.
	CUDA_CHECK_ERROR(cudaMemset(dev_in, 0, in_buf_bytes));

	// Execute the transform with and without callbacks.
	do_transform(dev_in, dev_out1, nfft, elem_pitch, batch, false);
	do_transform(dev_in, dev_out2, nfft, elem_pitch, batch, true);
	}

	int main()
	{
	do_test(32768, 32);
	}