Skip to content

Instantly share code, notes, and snippets.

@onionmk2
Last active January 8, 2018 11:21
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save onionmk2/0a4c466a41e0da2504a449bfa497b92a to your computer and use it in GitHub Desktop.
Save onionmk2/0a4c466a41e0da2504a449bfa497b92a to your computer and use it in GitHub Desktop.
use_cudaMallocManaged.cu
#include <cuda_runtime.h>
#include <iostream>
#include <device_launch_parameters.h>
#include "cuda_error_detector.h"
#include <vector>
#ifdef __CUDACC__
// Compiling with nvcc: expand to the real triple-chevron launch syntax.
#define KERNEL_ARGS2(gridDims, blockDims) <<< gridDims, blockDims >>>
#define KERNEL_ARGS3(gridDims, blockDims, sharedBytes) <<< gridDims, blockDims, sharedBytes >>>
#define KERNEL_ARGS4(gridDims, blockDims, sharedBytes, stream) <<< gridDims, blockDims, sharedBytes, stream >>>
#else
// Host-only tooling (IntelliSense, plain C++ parsers) rejects <<< >>>,
// so expand the launch arguments to nothing to keep the file parseable.
#define KERNEL_ARGS2(gridDims, blockDims)
#define KERNEL_ARGS3(gridDims, blockDims, sharedBytes)
#define KERNEL_ARGS4(gridDims, blockDims, sharedBytes, stream)
#endif
namespace add_loop_blocks
{
// Launch configuration: 100 blocks x 2 threads = 200 threads total.
const int number_of_blocks = 100;
const int threads_per_block = 2;
const int number_of_threads = number_of_blocks * threads_per_block;
// Element count is deliberately much larger than the thread count so the
// grid-stride loop in add() performs many iterations per thread.
const int number_of_data = 654321;

// Flatten the (up to 3D) block/thread coordinates into one global thread
// index. For the 1D launch used in this file this reduces to
// blockIdx.x * blockDim.x + threadIdx.x, but the general form keeps the
// helper correct for any grid/block shape.
__device__ int get_id()
{
int block_id = blockIdx.z * (gridDim.x * gridDim.y)
+ blockIdx.y * (gridDim.x)
+ blockIdx.x;
int threadId = block_id * (blockDim.x * blockDim.y * blockDim.z)
+ (threadIdx.z * (blockDim.x * blockDim.y))
+ (threadIdx.y * blockDim.x)
+ threadIdx.x;
return threadId;
}

// Element-wise c[i] = a[i] + b[i] over number_of_data elements using a
// grid-stride loop: each thread handles indices id, id + stride, ...
// Preconditions: a, b, c point to device-accessible memory of at least
// number_of_data ints.
__global__ void add(const int* a, const int* b, int *c)
{
// Compute the stride from the actual launch dimensions instead of the
// compile-time number_of_threads constant, so the kernel stays correct
// even if the launch configuration is changed independently.
int stride = (gridDim.x * gridDim.y * gridDim.z)
* (blockDim.x * blockDim.y * blockDim.z);
for (int id = get_id(); id < number_of_data; id += stride)
{
c[id] = a[id] + b[id];
}
}

// Demo driver: allocate managed (unified) memory, fill inputs on the CPU,
// add them on the GPU, and print the last result. Returns 0 on success
// (CudaSafeCall aborts on any CUDA error).
int main_add_loop_blocks(void)
{
const auto sizes = number_of_data * sizeof(int);
int* a = nullptr;
int* b = nullptr;
int* c = nullptr;
// Allocate managed memory, accessible from both CPU and GPU.
{
CudaSafeCall(cudaMallocManaged(&a, sizes));
CudaSafeCall(cudaMallocManaged(&b, sizes));
CudaSafeCall(cudaMallocManaged(&c, sizes));
}
// Assign initial values; the CPU writes directly to managed memory.
for (int i = 0; i < number_of_data; ++i)
{
a[i] = i;
b[i] = i + 1;
}
// Launch the add kernel; the GPU reads/writes the same managed memory.
add_loop_blocks::add KERNEL_ARGS2(number_of_blocks, threads_per_block)(a, b, c);
// Kernel launches do not return an error code: a bad launch configuration
// only surfaces via cudaGetLastError(), and in-kernel faults surface at
// the next synchronizing call. Check both explicitly.
CudaSafeCall(cudaGetLastError());
CudaSafeCall(cudaDeviceSynchronize());
// To print every element, uncomment the loop below.
// for (int i = 0; i < number_of_data; ++i)
// {
// std::cout << i << " is " << a[i] << " " << b[i] << " " << c[i] << "\n";
// }
// Show only the last item (CPU read is safe after the synchronize above).
const auto i = number_of_data - 1;
std::cout << i << " is " << a[i] << " " << b[i] << " " << c[i] << "\n";
// Free the managed allocations.
{
CudaSafeCall(cudaFree(a));
CudaSafeCall(cudaFree(b));
CudaSafeCall(cudaFree(c));
}
return 0;
}
}
// Host entry point: run the managed-memory vector-add demo.
// The demo's return value is explicitly discarded; this program
// always exits with 0.
int main()
{
(void)add_loop_blocks::main_add_loop_blocks();
return 0;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment