Skip to content

Instantly share code, notes, and snippets.

@onionmk2
Last active January 8, 2018 11:21
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save onionmk2/0a4c466a41e0da2504a449bfa497b92a to your computer and use it in GitHub Desktop.
Save onionmk2/0a4c466a41e0da2504a449bfa497b92a to your computer and use it in GitHub Desktop.
use_cudaMallocManaged.cu
#include <cuda_runtime.h>
#include <iostream>
#include <device_launch_parameters.h>
#include "cuda_error_detector.h"
#include <vector>
#ifdef __CUDACC__
// Compiling with nvcc: expand to the real triple-chevron launch syntax.
#define KERNEL_ARGS2(gridDims, blockDims) <<< gridDims, blockDims >>>
#define KERNEL_ARGS3(gridDims, blockDims, sharedBytes) <<< gridDims, blockDims, sharedBytes >>>
#define KERNEL_ARGS4(gridDims, blockDims, sharedBytes, stream) <<< gridDims, blockDims, sharedBytes, stream >>>
#else
// Host-only tooling (IntelliSense, plain C++ parsers) rejects <<< >>>,
// so expand the launch arguments to nothing to keep the file parseable.
#define KERNEL_ARGS2(gridDims, blockDims)
#define KERNEL_ARGS3(gridDims, blockDims, sharedBytes)
#define KERNEL_ARGS4(gridDims, blockDims, sharedBytes, stream)
#endif
namespace add_loop_blocks
{
// Launch configuration: 100 blocks x 2 threads = 200 threads total.
const int number_of_blocks = 100;
const int threads_per_block = 2;
const int number_of_threads = number_of_blocks * threads_per_block;
// Element count is deliberately much larger than the thread count so the
// grid-stride loop in add() performs many iterations per thread.
const int number_of_data = 654321;

// Flatten the (up to 3D) block/thread coordinates into one global thread
// index. For the 1D launch used in this file this reduces to
// blockIdx.x * blockDim.x + threadIdx.x, but the general form keeps the
// helper correct for any grid/block shape.
__device__ int get_id()
{
int block_id = blockIdx.z * (gridDim.x * gridDim.y)
+ blockIdx.y * (gridDim.x)
+ blockIdx.x;
int threadId = block_id * (blockDim.x * blockDim.y * blockDim.z)
+ (threadIdx.z * (blockDim.x * blockDim.y))
+ (threadIdx.y * blockDim.x)
+ threadIdx.x;
return threadId;
}

// Element-wise c[i] = a[i] + b[i] over number_of_data elements using a
// grid-stride loop: each thread handles indices id, id + stride, ...
// Preconditions: a, b, c point to device-accessible memory of at least
// number_of_data ints.
__global__ void add(const int* a, const int* b, int *c)
{
// Compute the stride from the actual launch dimensions instead of the
// compile-time number_of_threads constant, so the kernel stays correct
// even if the launch configuration is changed independently.
int stride = (gridDim.x * gridDim.y * gridDim.z)
* (blockDim.x * blockDim.y * blockDim.z);
for (int id = get_id(); id < number_of_data; id += stride)
{
c[id] = a[id] + b[id];
}
}

// Demo driver: allocate managed (unified) memory, fill inputs on the CPU,
// add them on the GPU, and print the last result. Returns 0 on success
// (CudaSafeCall aborts on any CUDA error).
int main_add_loop_blocks(void)
{
const auto sizes = number_of_data * sizeof(int);
int* a = nullptr;
int* b = nullptr;
int* c = nullptr;
// Allocate managed memory, accessible from both CPU and GPU.
{
CudaSafeCall(cudaMallocManaged(&a, sizes));
CudaSafeCall(cudaMallocManaged(&b, sizes));
CudaSafeCall(cudaMallocManaged(&c, sizes));
}
// Assign initial values; the CPU writes directly to managed memory.
for (int i = 0; i < number_of_data; ++i)
{
a[i] = i;
b[i] = i + 1;
}
// Launch the add kernel; the GPU reads/writes the same managed memory.
add_loop_blocks::add KERNEL_ARGS2(number_of_blocks, threads_per_block)(a, b, c);
// Kernel launches do not return an error code: a bad launch configuration
// only surfaces via cudaGetLastError(), and in-kernel faults surface at
// the next synchronizing call. Check both explicitly.
CudaSafeCall(cudaGetLastError());
CudaSafeCall(cudaDeviceSynchronize());
// To print every element, uncomment the loop below.
// for (int i = 0; i < number_of_data; ++i)
// {
// std::cout << i << " is " << a[i] << " " << b[i] << " " << c[i] << "\n";
// }
// Show only the last item (CPU read is safe after the synchronize above).
const auto i = number_of_data - 1;
std::cout << i << " is " << a[i] << " " << b[i] << " " << c[i] << "\n";
// Free the managed allocations.
{
CudaSafeCall(cudaFree(a));
CudaSafeCall(cudaFree(b));
CudaSafeCall(cudaFree(c));
}
return 0;
}
}
// Host entry point: run the managed-memory vector-add demo.
// The demo's return value is explicitly discarded; this program
// always exits with 0.
int main()
{
(void)add_loop_blocks::main_add_loop_blocks();
return 0;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment