Last active
November 22, 2018 02:25
-
-
Save kezunlin/73a82cad86e920bb300b042c1f7002d1 to your computer and use it in GitHub Desktop.
CUDA demo: 1-D vector addition and 2-D matrix multiplication
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <stdlib.h>
#include <cmath>
#include <iostream>
#include <cuda_runtime.h>
using namespace std; | |
/* | |
https://blog.csdn.net/fb_help/article/details/79330815 | |
foo.cuh + foo.cu | |
*/ | |
// Host-side element-wise addition: c[i] = a[i] + b[i] for i in [0, n).
void add(int n, float *a, float *b, float *c)
{
    float *stop = c + n;
    while (c != stop)
    {
        *c++ = *a++ + *b++;
    }
}
// GPU kernel: element-wise vector add, one thread per element.
// Expects a 1-D launch covering at least n threads; threads whose
// global index falls past n are masked off by the bounds check
// (the original wrote out of bounds whenever the grid over-covered n).
__global__ void kernel_add(int n, float *a, float *b, float *c)
{
    // flat global thread index
    int i = blockDim.x * blockIdx.x + threadIdx.x;
    // guard the grid tail: n is rarely an exact multiple of the block size
    if (i < n)
    {
        c[i] = a[i] + b[i];
    }
}
// GPU kernel: element-wise vector add using a grid-stride loop, so the
// result is correct for any 1-D launch configuration — each thread
// processes every (blockDim.x * gridDim.x)-th element starting at its
// own global index.
__global__ void kernel_add2(int n, float *a, float *b, float *c)
{
    int stride = blockDim.x * gridDim.x;
    for (int idx = blockDim.x * blockIdx.x + threadIdx.x; idx < n; idx += stride)
    {
        c[idx] = a[idx] + b[idx];
    }
}
// Query every CUDA device on this machine and print its basic
// properties (name, global memory, SM count, per-SM/per-block limits).
void device_info()
{
    int count = 0;
    cudaGetDeviceCount(&count);
    for (int dev = 0; dev < count; ++dev)
    {
        cudaDeviceProp prop;
        cudaGetDeviceProperties(&prop, dev);
        std::cout << "使用GPU device " << dev << ": " << prop.name << std::endl;
        std::cout << "设备全局内存总量: " << prop.totalGlobalMem / 1024 / 1024 << "MB" << std::endl;
        std::cout << "SM的数量:" << prop.multiProcessorCount << std::endl;
        std::cout << "每个SM的最大线程数:" << prop.maxThreadsPerMultiProcessor << std::endl;
        std::cout << "每个SM的最大线程束数(warps):" << prop.maxThreadsPerMultiProcessor / 32 << std::endl;
        std::cout << "每个线程块(Block)的共享内存大小:" << prop.sharedMemPerBlock / 1024.0 << " KB" << std::endl;
        std::cout << "每个线程块(Block)的最大线程数:" << prop.maxThreadsPerBlock << std::endl;
        std::cout << "每个线程块(Block)可用的32位寄存器数量: " << prop.regsPerBlock << std::endl;
        std::cout << "======================================================" << std::endl;
    }
}
// CPU baseline: add two 1M-element vectors (90 + 10) on the host and
// print the accumulated absolute deviation from the expected 100.0.
void test_cpu()
{
    const int n = 1024 * 1024;
    const int bytes = n * sizeof(float);

    // host allocations
    float *a = (float*)malloc(bytes);
    float *b = (float*)malloc(bytes);
    float *c = (float*)malloc(bytes);

    // initialize inputs so every output element should equal 100
    for (int i = 0; i < n; ++i)
    {
        a[i] = 90.0;
        b[i] = 10.0;
    }

    // run the addition for 1M elements on the CPU
    add(n, a, b, c);

    // accumulate the absolute error (0 when everything worked)
    float max_error = 0.0;
    for (int i = 0; i < n; ++i)
    {
        max_error += fabs(100.0 - c[i]);
    }
    cout << "max error is " << max_error << endl;

    // release host memory
    free(a);
    free(b);
    free(c);
}
/*
cudaMalloc+cudaMemcpy+cudaFree
*/
// Explicit-memory GPU path: allocate device buffers, copy the inputs
// up, launch kernel_add over 1M elements, copy the result back and
// verify it on the host. Returns 0 on completion (errors are printed
// via the error sum rather than propagated, matching the other demos).
int test_gpu_1()
{
    float *A, *Ad, *B, *Bd, *C, *Cd;
    int n = 1024 * 1024;
    int size = n * sizeof(float);

    // host allocations
    A = (float*)malloc(size);
    B = (float*)malloc(size);
    C = (float*)malloc(size);

    // initialize inputs so every output element should equal 100
    for (int i = 0; i < n; i++)
    {
        A[i] = 90.0;
        B[i] = 10.0;
    }

    // device allocations
    cudaMalloc((void**)&Ad, size);
    cudaMalloc((void**)&Bd, size);
    cudaMalloc((void**)&Cd, size);

    // copy inputs host -> device
    // (the original copied B to Bd twice — duplicate removed)
    cudaMemcpy(Ad, A, size, cudaMemcpyHostToDevice);
    cudaMemcpy(Bd, B, size, cudaMemcpyHostToDevice);

    // 1-D launch config: ceil(n / block_size) blocks of block_size threads
    int block_size = 512;
    int num_of_blocks = (n + block_size - 1) / block_size;
    dim3 dimBlock(block_size);
    dim3 dimGrid(num_of_blocks);

    // launch the kernel
    kernel_add<<<dimGrid, dimBlock>>>(n, Ad, Bd, Cd);

    // blocking device -> host copy; also synchronizes with the kernel
    cudaMemcpy(C, Cd, size, cudaMemcpyDeviceToHost);

    // accumulate the absolute error (0 when everything worked)
    float max_error = 0.0;
    for (int i = 0; i < n; i++)
    {
        max_error += fabs(100.0 - C[i]);
    }
    cout << "max error is " << max_error << endl;

    // release host and device memory
    free(A);
    free(B);
    free(C);
    cudaFree(Ad);
    cudaFree(Bd);
    cudaFree(Cd);
    return 0;
}
/*
cudaMallocManaged+cudaDeviceSynchronize+cudaFree
*/
// Unified-memory GPU path: allocate managed buffers reachable from both
// host and device, launch the grid-stride kernel, synchronize, then
// verify the result on the host and print the accumulated error.
void test_gpu_2()
{
    float *A, *B, *C;
    int n = 1024 * 1024;
    int size = n * sizeof(float);

    // Allocate Unified Memory — accessible from CPU or GPU
    cudaMallocManaged((void**)&A, size);
    cudaMallocManaged((void**)&B, size);
    cudaMallocManaged((void**)&C, size);

    // initialize inputs so every output element should equal 100
    for (int i = 0; i < n; i++)
    {
        A[i] = 90.0;
        B[i] = 10.0;
    }

    // 1-D launch config: ceil(n / block_size) blocks of block_size threads
    int block_size = 512;
    int num_of_blocks = (n + block_size - 1) / block_size;
    dim3 dimBlock(block_size);
    dim3 dimGrid(num_of_blocks);

    // launch the grid-stride kernel
    kernel_add2<<<dimGrid, dimBlock>>>(n, A, B, C);

    // Wait for the GPU to finish before touching C on the host
    cudaDeviceSynchronize();

    // accumulate the absolute error (0 when everything worked)
    float max_error = 0.0;
    for (int i = 0; i < n; i++)
    {
        max_error += fabs(100.0 - C[i]);
    }
    cout << "max error is " << max_error << endl;

    // Free Unified Memory
    cudaFree(A);
    cudaFree(B);
    cudaFree(C);
}
// ========================================
// 2-dim
// ========================================
// Row-major matrix: M(row, col) = *(M.elements + row * M.width + col)
struct Matrix
{
    int width;       // number of columns
    int height;      // number of rows
    float *elements; // row-major storage; width * height floats, owned by the caller
};
// Read element (row, col) of row-major matrix A (no bounds checking).
__device__ float getElement(Matrix *A, int row, int col)
{
    int offset = row * A->width + col;
    return A->elements[offset];
}
// Write value into element (row, col) of row-major matrix A (no bounds checking).
__device__ void setElement(Matrix *A, int row, int col, float value)
{
    int offset = row * A->width + col;
    A->elements[offset] = value;
}
// Matrix-multiply kernel: 2-D launch, one thread per output element
// C(row, col), with x mapping to columns and y to rows.
// Threads outside C's extent exit early, so any grid size that covers
// C is safe (the original relied on the grid dividing exactly).
// Assumes the inner dimensions agree (A->width == B->height) — not checked.
__global__ void matMulKernel(Matrix *A, Matrix *B, Matrix *C)
{
    int row = threadIdx.y + blockIdx.y * blockDim.y;
    int col = threadIdx.x + blockIdx.x * blockDim.x;
    // mask off threads past the edge of C; no __syncthreads below, so
    // an early return is safe
    if (row >= C->height || col >= C->width)
        return;
    // dot product of row `row` of A with column `col` of B
    float sum = 0.0;
    for (int i = 0; i < A->width; ++i)
    {
        sum += getElement(A, row, i) * getElement(B, i, col);
    }
    setElement(C, row, col, sum);
}
// Unified-memory matrix multiply demo: C = A * B with 256x256 matrices
// of constants (A = 1, B = 2), so every element of C should be 2*width.
// Prints the accumulated absolute deviation from that expected value.
void test_gpu_3()
{
    const int width = 1 << 8;
    const int height = 1 << 8;

    // managed allocations for the matrix headers...
    Matrix *A, *B, *C;
    cudaMallocManaged((void**)&A, sizeof(Matrix));
    cudaMallocManaged((void**)&B, sizeof(Matrix));
    cudaMallocManaged((void**)&C, sizeof(Matrix));

    // ...and for their element storage
    int nBytes = width * height * sizeof(float);
    cudaMallocManaged((void**)&A->elements, nBytes);
    cudaMallocManaged((void**)&B->elements, nBytes);
    cudaMallocManaged((void**)&C->elements, nBytes);

    // fill in dimensions and constant input data
    A->height = height;
    A->width = width;
    B->height = height;
    B->width = width;
    C->height = height;
    C->width = width;
    for (int i = 0; i < width * height; ++i)
    {
        A->elements[i] = 1.0;
        B->elements[i] = 2.0;
    }

    // 2-D launch config: 32x32 threads per block, grid covering C
    dim3 blockSize(32, 32);
    dim3 gridSize(
        (width + blockSize.x - 1) / blockSize.x,
        (height + blockSize.y - 1) / blockSize.y
    );

    // launch the kernel
    matMulKernel<<<gridSize, blockSize>>>(A, B, C);

    // wait for the GPU before reading C on the host
    cudaDeviceSynchronize();

    // accumulate the absolute error (0 when everything worked)
    float maxError = 0.0;
    for (int i = 0; i < width * height; ++i)
    {
        maxError += fabs(C->elements[i] - 2 * width);
    }
    cout << "max error is " << maxError << endl;

    // release managed memory (element buffers before their headers)
    cudaFree(A->elements);
    cudaFree(B->elements);
    cudaFree(C->elements);
    cudaFree(A);
    cudaFree(B);
    cudaFree(C);
}
// Entry point: runs the managed-memory matrix-multiply demo.
// Uncomment the calls in the block below to run the other demos.
int main()
{
    /*
    device_info();
    test_cpu();
    test_gpu_1();
    test_gpu_2();
    */
    test_gpu_3();
    return 0;
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment