Last active
January 8, 2018 11:21
-
-
Save onionmk2/0a4c466a41e0da2504a449bfa497b92a to your computer and use it in GitHub Desktop.
use_cudaMallocManaged.cu
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <cuda_runtime.h> | |
#include <iostream> | |
#include <device_launch_parameters.h> | |
#include "cuda_error_detector.h" | |
#include <vector> | |
// Kernel launch-configuration helpers.
//
// When the translation unit is compiled by nvcc (__CUDACC__ is defined) each
// macro expands to the corresponding <<< ... >>> execution configuration, so
//     kernel KERNEL_ARGS2(blocks, threads)(args...);
// becomes
//     kernel <<< blocks, threads >>> (args...);
//
// When __CUDACC__ is not defined the macros expand to nothing and the same
// statement reads as an ordinary function call. Presumably this exists so that
// non-CUDA parsers (IDE code models, linters) do not choke on the <<< >>>
// syntax -- confirm against the project's tooling.
//
//   numberOfBlocks   grid dimensions (first launch parameter)
//   threadsPerBlock  block dimensions (second launch parameter)
//   sh_mem           dynamic shared memory size in bytes (third launch parameter)
//   stream           CUDA stream to launch into (fourth launch parameter)
#ifdef __CUDACC__
#define KERNEL_ARGS2(numberOfBlocks, threadsPerBlock) <<< numberOfBlocks, threadsPerBlock >>>
#define KERNEL_ARGS3(numberOfBlocks, threadsPerBlock, sh_mem) <<< numberOfBlocks, threadsPerBlock, sh_mem >>>
#define KERNEL_ARGS4(numberOfBlocks, threadsPerBlock, sh_mem, stream) <<< numberOfBlocks, threadsPerBlock, sh_mem, stream >>>
#else
#define KERNEL_ARGS2(numberOfBlocks, threadsPerBlock)
#define KERNEL_ARGS3(numberOfBlocks, threadsPerBlock, sh_mem)
#define KERNEL_ARGS4(numberOfBlocks, threadsPerBlock, sh_mem, stream)
#endif
// Element-wise addition of two integer vectors using CUDA managed memory.
//
// The host allocates three buffers with cudaMallocManaged, fills the inputs on
// the CPU, launches the `add` kernel on the GPU, waits for it to finish and
// then reads the result back on the CPU through the same pointers.
namespace add_loop_blocks
{
    // Launch configuration used by main_add_loop_blocks().
    const int number_of_blocks = 100;
    const int threads_per_block = 2;
    // Total number of threads in that launch. The kernel no longer depends on
    // this value; it is kept so existing users of the constant still compile.
    const int number_of_threads = number_of_blocks * threads_per_block;
    // Number of elements in each of the vectors a, b and c.
    const int number_of_data = 654321;

    // Returns the flat, zero-based index of the calling thread within the
    // whole launch, for any combination of 1D/2D/3D grid and block shapes.
    __device__ int get_id()
    {
        // Flatten the 3D block coordinate into a single block number.
        const int block_id = blockIdx.z * (gridDim.x * gridDim.y)
            + blockIdx.y * gridDim.x
            + blockIdx.x;
        // Offset by all threads of the preceding blocks, then flatten the 3D
        // thread coordinate inside this block.
        const int thread_id = block_id * (blockDim.x * blockDim.y * blockDim.z)
            + threadIdx.z * (blockDim.x * blockDim.y)
            + threadIdx.y * blockDim.x
            + threadIdx.x;
        return thread_id;
    }

    // Computes c[i] = a[i] + b[i] for every i in [0, number_of_data).
    //
    // a, b : input vectors, each holding at least number_of_data ints.
    // c    : output vector, holding at least number_of_data ints.
    //
    // Each thread starts at its flat index and advances by the total number
    // of launched threads, so every element is processed exactly once for any
    // grid/block shape, not only the one described by the constants above.
    __global__ void add(const int* a, const int* b, int* c)
    {
        // Total threads actually launched, read from the launch dimensions
        // instead of a compile-time constant so the kernel stays correct when
        // the launch configuration changes.
        const int stride = static_cast<int>(
            gridDim.x * gridDim.y * gridDim.z
            * blockDim.x * blockDim.y * blockDim.z);

        for (int id = get_id(); id < number_of_data; id += stride)
        {
            c[id] = a[id] + b[id];
        }
    }

    // Runs the complete example: allocate, initialise, add on the GPU, print
    // the last element and free. Returns 0.
    //
    // CUDA API results are passed to CudaSafeCall, which is declared in
    // "cuda_error_detector.h"; its failure behaviour is defined there.
    int main_add_loop_blocks(void)
    {
        const auto sizes = number_of_data * sizeof(int);
        int* a = nullptr;
        int* b = nullptr;
        int* c = nullptr;

        // Allocate managed memory, which is used by both cpu and gpu.
        {
            CudaSafeCall(cudaMallocManaged(&a, sizes));
            CudaSafeCall(cudaMallocManaged(&b, sizes));
            CudaSafeCall(cudaMallocManaged(&c, sizes));
        }

        // Assign initial values. The cpu writes the managed memory directly.
        for (int i = 0; i < number_of_data; ++i)
        {
            a[i] = i;
            b[i] = i + 1;
        }

        // Add. The gpu reads and writes the same managed memory.
        add_loop_blocks::add KERNEL_ARGS2(number_of_blocks, threads_per_block)(a, b, c);
        // A kernel launch returns no status itself; an invalid launch
        // configuration is only visible through cudaGetLastError().
        CudaSafeCall(cudaGetLastError());

        // Wait for the kernel to finish before the cpu touches the results;
        // this also surfaces errors raised while the kernel was executing.
        CudaSafeCall(cudaDeviceSynchronize());

        // If you want to show all values, uncomment.
        // for (int i = 0; i < number_of_data; ++i)
        // {
        //     std::cout << i << " is " << a[i] << " " << b[i] << " " << c[i] << "\n";
        // }

        // Show only the last item.
        const auto i = number_of_data - 1;
        std::cout << i << " is " << a[i] << " " << b[i] << " " << c[i] << "\n";

        // Free the managed buffers.
        {
            CudaSafeCall(cudaFree(a));
            CudaSafeCall(cudaFree(b));
            CudaSafeCall(cudaFree(c));
        }
        return 0;
    }
}
// Program entry point: runs the managed-memory vector addition example and
// forwards its status as the process exit code instead of discarding it.
int main()
{
    return add_loop_blocks::main_add_loop_blocks();
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment