Imxset21/kernel.cu

## kernel.cu
/*
    Dynamically allocates memory for a device-side variable-length array.

    Primary purpose is to be able to use the device variable to access the
    allocated memory rather than having to keep track through function params.

    @author Pedro Rittner
    @email pedro.rittner@outlook.com
 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include <cuda_runtime.h>
#include <cuda_runtime_api.h>

// Utility macro to check for and print CUDA errors.
// Evaluates `value` exactly once. If the result is not cudaSuccess, prints the
// CUDA error string together with the call site's line and file to stderr and
// terminates the process with EXIT_FAILURE.
// The body is wrapped in do { } while (0) so the expansion plus its trailing
// semicolon forms a single statement (safe inside an unbraced if/else), and
// the argument is parenthesized so any expression is captured as a whole.
#define CUDA_CHECK_RETURN(value) do {                                       \
    cudaError_t _m_cudaStat = (value);                                      \
    if (_m_cudaStat != cudaSuccess) {                                       \
        fprintf(stderr, "Error %s at line %d in file %s\n",                 \
                cudaGetErrorString(_m_cudaStat), __LINE__, __FILE__);       \
        exit(EXIT_FAILURE);                                                 \
    } } while (0)

//////////////////////////
// Device Symbol Malloc //
//////////////////////////

// Device-side global pointer through which kernels reach the float buffer.
// It starts out NULL; main() overwrites it via cudaMemcpyToSymbol with the
// address obtained from cudaMalloc, so kernels need no pointer parameter.
__device__ float* my_float_ptr = NULL;

// Dummy test kernel: reads element 0 of the buffer my_float_ptr points to and
// prints it as "my_float_ptr: <value>" using the %f format.
// The pointer is dereferenced without any check, so it must already hold a
// valid device address when the kernel runs.
__global__ void cuda_dev_malloc_test()
{
	const float first_value = *my_float_ptr;
	printf("my_float_ptr: %f\n", first_value);
}

// Demonstrates publishing a cudaMalloc'd buffer through a __device__ symbol:
// allocates one float on the device, stores 5.0 in it, copies the buffer's
// address into my_float_ptr, runs cuda_dev_malloc_test on a single thread,
// then frees the buffer and resets the device.
//
// argc/argv are unused. Returns EXIT_SUCCESS; if any CUDA call fails,
// CUDA_CHECK_RETURN prints the error and exits with EXIT_FAILURE instead.
int main(int argc, char const *argv[])
{
	(void)argc;
	(void)argv;

	// Holds the device allocation's address in host memory. Declared on its
	// own line so it is obvious that only this variable is a pointer.
	float* host_float_ptr = NULL;
	// Float literal, so no double-to-float conversion is involved.
	const float host_float_val = 5.0f;

	// First we allocate the space we need on the device,
	// and hold that pointer's value in host memory.
	CUDA_CHECK_RETURN(cudaMalloc((void**)&host_float_ptr, sizeof(float)));

	// We copy the value (5.0) we want in the array to the just-allocated pointer on the device
	CUDA_CHECK_RETURN(cudaMemcpy(host_float_ptr, &host_float_val, sizeof(float), cudaMemcpyHostToDevice));

	// Finally, we copy the allocated pointer value (the address itself, hence
	// sizeof(float*)) to the on-device "symbol," basically making an alias
	CUDA_CHECK_RETURN(cudaMemcpyToSymbol(my_float_ptr, &host_float_ptr, sizeof(float*), size_t(0), cudaMemcpyHostToDevice));

	// CUDA boilerplate to run this in a single thread
	const int nThreads = 1;
	const int nBlocks = 1;
	dim3 dimGrid(nBlocks);
	dim3 dimBlock(nThreads);

	// Run the test function
	cuda_dev_malloc_test<<<dimGrid, dimBlock>>>();

	// Launch failures are reported through cudaGetLastError, so check it
	// right after the launch; faults raised while the kernel executes
	// surface at the synchronization that follows.
	CUDA_CHECK_RETURN(cudaGetLastError());
	CUDA_CHECK_RETURN(cudaDeviceSynchronize());

	// Release the allocation explicitly rather than relying on the reset.
	CUDA_CHECK_RETURN(cudaFree(host_float_ptr));

	CUDA_CHECK_RETURN(cudaDeviceReset());
	return EXIT_SUCCESS;
}
	/*
	Dynamically allocates memory for a device-side variable-length array.

	Primary purpose is to be able to use the device variable to access the
	allocated memory rather than having to keep track through function params.

	@author Pedro Rittner
	@email pedro.rittner@outlook.com
	*/
	#include <stdio.h>
	#include <stdlib.h>
	#include <string.h>

	#include <cuda_runtime.h>
	#include <cuda_runtime_api.h>

	// Utility macro to check for and print CUDA errors.
	// Evaluates `value` exactly once. If the result is not cudaSuccess, prints
	// the CUDA error string together with the call site's line and file to
	// stderr and terminates the process with EXIT_FAILURE.
	// The body is wrapped in do { } while (0) so the expansion plus its trailing
	// semicolon forms a single statement (safe inside an unbraced if/else), and
	// the argument is parenthesized so any expression is captured as a whole.
	#define CUDA_CHECK_RETURN(value) do { \
		cudaError_t _m_cudaStat = (value); \
		if (_m_cudaStat != cudaSuccess) { \
			fprintf(stderr, "Error %s at line %d in file %s\n", \
			        cudaGetErrorString(_m_cudaStat), __LINE__, __FILE__); \
			exit(EXIT_FAILURE); \
		} } while (0)

	//////////////////////////
	// Device Symbol Malloc //
	//////////////////////////

	// Device-side global pointer through which kernels reach the float buffer.
	// It starts out NULL; main() overwrites it via cudaMemcpyToSymbol with the
	// address obtained from cudaMalloc, so kernels need no pointer parameter.
	__device__ float* my_float_ptr = NULL;

	// Dummy test kernel: reads element 0 of the buffer my_float_ptr points to
	// and prints it as "my_float_ptr: <value>" using the %f format.
	// The pointer is dereferenced without any check, so it must already hold a
	// valid device address when the kernel runs.
	__global__ void cuda_dev_malloc_test()
	{
		const float first_value = *my_float_ptr;
		printf("my_float_ptr: %f\n", first_value);
	}

	// Demonstrates publishing a cudaMalloc'd buffer through a __device__ symbol:
	// allocates one float on the device, stores 5.0 in it, copies the buffer's
	// address into my_float_ptr, runs cuda_dev_malloc_test on a single thread,
	// then frees the buffer and resets the device.
	//
	// argc/argv are unused. Returns EXIT_SUCCESS; if any CUDA call fails,
	// CUDA_CHECK_RETURN prints the error and exits with EXIT_FAILURE instead.
	int main(int argc, char const *argv[])
	{
		(void)argc;
		(void)argv;

		// Holds the device allocation's address in host memory. Declared on its
		// own line so it is obvious that only this variable is a pointer.
		float* host_float_ptr = NULL;
		// Float literal, so no double-to-float conversion is involved.
		const float host_float_val = 5.0f;

		// First we allocate the space we need on the device,
		// and hold that pointer's value in host memory.
		CUDA_CHECK_RETURN(cudaMalloc((void**)&host_float_ptr, sizeof(float)));

		// We copy the value (5.0) we want in the array to the just-allocated pointer on the device
		CUDA_CHECK_RETURN(cudaMemcpy(host_float_ptr, &host_float_val, sizeof(float), cudaMemcpyHostToDevice));

		// Finally, we copy the allocated pointer value (the address itself, hence
		// sizeof(float*)) to the on-device "symbol," basically making an alias
		CUDA_CHECK_RETURN(cudaMemcpyToSymbol(my_float_ptr, &host_float_ptr, sizeof(float*), size_t(0), cudaMemcpyHostToDevice));

		// CUDA boilerplate to run this in a single thread
		const int nThreads = 1;
		const int nBlocks = 1;
		dim3 dimGrid(nBlocks);
		dim3 dimBlock(nThreads);

		// Run the test function
		cuda_dev_malloc_test<<<dimGrid, dimBlock>>>();

		// Launch failures are reported through cudaGetLastError, so check it
		// right after the launch; faults raised while the kernel executes
		// surface at the synchronization that follows.
		CUDA_CHECK_RETURN(cudaGetLastError());
		CUDA_CHECK_RETURN(cudaDeviceSynchronize());

		// Release the allocation explicitly rather than relying on the reset.
		CUDA_CHECK_RETURN(cudaFree(host_float_ptr));

		CUDA_CHECK_RETURN(cudaDeviceReset());
		return EXIT_SUCCESS;
	}