CUDA demo: 1-dim vector add, 2-dim matrix multiply
#include <stdlib.h>
#include <cmath> // for fabs
#include <iostream>
#include <cuda_runtime.h>
using namespace std;
/*
https://blog.csdn.net/fb_help/article/details/79330815
foo.cuh + foo.cu
*/
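// The CUDA API calls in this demo are unchecked. As a hedged sketch (my
// addition, not part of the original gist), a minimal error-check macro
// like the one below is a common way to surface failures early; the name
// CUDA_CHECK is illustrative, but cudaGetErrorString is standard CUDA.
#define CUDA_CHECK(call)                                                  \
    do {                                                                  \
        cudaError_t err_ = (call);                                        \
        if (err_ != cudaSuccess) {                                        \
            std::cerr << "CUDA error at " << __FILE__ << ":" << __LINE__  \
                      << ": " << cudaGetErrorString(err_) << std::endl;   \
            exit(EXIT_FAILURE);                                           \
        }                                                                 \
    } while (0)
// usage (sketch): CUDA_CHECK(cudaMalloc((void**)&Ad, size));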
// function to add the elements of two arrays on CPU
void add(int n, float *a, float *b, float *c)
{
    for (int i = 0; i < n; i++)
        c[i] = a[i] + b[i];
}
// kernel on GPU: one thread per element
__global__ void kernel_add(int n, float *a, float *b, float *c)
{
    // global thread id
    int i = blockDim.x * blockIdx.x + threadIdx.x;
    if (i < n) // guard: the rounded-up grid may launch threads past n
        c[i] = a[i] + b[i];
}
__global__ void kernel_add2(int n, float *a, float *b, float *c)
{
    // global thread id
    int index = blockDim.x * blockIdx.x + threadIdx.x;
    // grid-stride loop: the stride is the total thread count of the grid
    int grid_stride = blockDim.x * gridDim.x; // 512*2048 = n for the launch below
    // here the grid already covers all n elements, so the loop runs only once
    for (int i = index; i < n; i += grid_stride)
    {
        c[i] = a[i] + b[i];
    }
}
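// Hedged sketch (my addition, not in the original gist): the point of the
// grid-stride form is that a fixed-size grid smaller than n still covers
// every element, because each thread simply loops. For example:
void launch_add_fixed_grid(int n, float *a, float *b, float *c)
{
    // 32 blocks * 256 threads = 8192 threads stride over all n elements
    kernel_add2<<<32, 256>>>(n, a, b, c);
}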
void device_info()
{
    int deviceCount;
    cudaGetDeviceCount(&deviceCount);
    for (int i = 0; i < deviceCount; i++)
    {
        cudaDeviceProp devProp;
        cudaGetDeviceProperties(&devProp, i);
        std::cout << "Using GPU device " << i << ": " << devProp.name << std::endl;
        std::cout << "Total global memory: " << devProp.totalGlobalMem / 1024 / 1024 << "MB" << std::endl;
        std::cout << "Number of SMs: " << devProp.multiProcessorCount << std::endl;
        std::cout << "Max threads per SM: " << devProp.maxThreadsPerMultiProcessor << std::endl;
        std::cout << "Max warps per SM: " << devProp.maxThreadsPerMultiProcessor / 32 << std::endl;
        std::cout << "Shared memory per block: " << devProp.sharedMemPerBlock / 1024.0 << " KB" << std::endl;
        std::cout << "Max threads per block: " << devProp.maxThreadsPerBlock << std::endl;
        std::cout << "32-bit registers per block: " << devProp.regsPerBlock << std::endl;
        std::cout << "======================================================" << std::endl;
    }
}
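// Hedged sketch (my addition): the runtime can also report the driver and
// runtime versions, which helps diagnose kernels that silently fail to run.
void version_info()
{
    int driverVersion = 0, runtimeVersion = 0;
    cudaDriverGetVersion(&driverVersion);
    cudaRuntimeGetVersion(&runtimeVersion);
    std::cout << "CUDA driver version: " << driverVersion
              << ", runtime version: " << runtimeVersion << std::endl;
}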
void test_cpu()
{
    float *A, *B, *C;
    int n = 1024 * 1024;
    int size = n * sizeof(float);
    // allocate host memory
    A = (float*)malloc(size);
    B = (float*)malloc(size);
    C = (float*)malloc(size);
    // initialize the arrays
    for (int i = 0; i < n; i++)
    {
        A[i] = 90.0;
        B[i] = 10.0;
    }
    // Run kernel on 1M elements on the CPU
    add(n, A, B, C);
    // check the accumulated error (sum of absolute deviations from 100.0)
    float total_error = 0.0;
    for (int i = 0; i < n; i++)
    {
        total_error += fabs(100.0 - C[i]);
    }
    cout << "total error is " << total_error << endl;
    // free host memory
    free(A);
    free(B);
    free(C);
}
/*
cudaMalloc+cudaMemcpy+cudaFree
*/
int test_gpu_1()
{
    float *A, *Ad, *B, *Bd, *C, *Cd;
    int n = 1024 * 1024;
    int size = n * sizeof(float);
    // allocate host memory
    A = (float*)malloc(size);
    B = (float*)malloc(size);
    C = (float*)malloc(size);
    // initialize the arrays
    for (int i = 0; i < n; i++)
    {
        A[i] = 90.0;
        B[i] = 10.0;
    }
    // allocate device memory
    cudaMalloc((void**)&Ad, size);
    cudaMalloc((void**)&Bd, size);
    cudaMalloc((void**)&Cd, size);
    // copy input data from host to device
    cudaMemcpy(Ad, A, size, cudaMemcpyHostToDevice);
    cudaMemcpy(Bd, B, size, cudaMemcpyHostToDevice);
    // 1-dim
    // kernel launch config: (1024*1024/512) blocks, 512 threads per block
    int block_size = 512;
    int num_of_blocks = (n + block_size - 1) / block_size;
    dim3 dimBlock(block_size);
    dim3 dimGrid(num_of_blocks);
    // launch the kernel
    kernel_add<<<dimGrid, dimBlock>>>(n, Ad, Bd, Cd);
    // copy the result from device back to host
    cudaMemcpy(C, Cd, size, cudaMemcpyDeviceToHost);
    // check the accumulated error
    float total_error = 0.0;
    for (int i = 0; i < n; i++)
    {
        total_error += fabs(100.0 - C[i]);
    }
    cout << "total error is " << total_error << endl;
    // free host and device memory
    free(A);
    free(B);
    free(C);
    cudaFree(Ad);
    cudaFree(Bd);
    cudaFree(Cd);
    return 0;
}
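// Hedged sketch (my addition, names of my own choosing): timing the same
// launch with CUDA events; the event API used here is standard CUDA.
float time_kernel_add(int n, float *Ad, float *Bd, float *Cd)
{
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    int block_size = 512;
    int num_of_blocks = (n + block_size - 1) / block_size;
    cudaEventRecord(start);
    kernel_add<<<num_of_blocks, block_size>>>(n, Ad, Bd, Cd);
    cudaEventRecord(stop);
    cudaEventSynchronize(stop);
    float ms = 0.0f;
    cudaEventElapsedTime(&ms, start, stop);
    cudaEventDestroy(start);
    cudaEventDestroy(stop);
    return ms; // elapsed kernel time in milliseconds
}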
/*
cudaMallocManaged+cudaDeviceSynchronize+cudaFree
*/
void test_gpu_2()
{
    float *A, *B, *C;
    int n = 1024 * 1024;
    int size = n * sizeof(float);
    // Allocate Unified Memory – accessible from CPU or GPU
    cudaMallocManaged((void**)&A, size);
    cudaMallocManaged((void**)&B, size);
    cudaMallocManaged((void**)&C, size);
    // initialize the arrays
    for (int i = 0; i < n; i++)
    {
        A[i] = 90.0;
        B[i] = 10.0;
    }
    // 1-dim
    // kernel launch config: (1024*1024/512) blocks, 512 threads per block
    int block_size = 512;
    int num_of_blocks = (n + block_size - 1) / block_size;
    dim3 dimBlock(block_size);
    dim3 dimGrid(num_of_blocks);
    // launch the kernel
    kernel_add2<<<dimGrid, dimBlock>>>(n, A, B, C);
    // Wait for GPU to finish before accessing on host
    cudaDeviceSynchronize(); // block until the GPU has finished all tasks
    // check the accumulated error
    float total_error = 0.0;
    for (int i = 0; i < n; i++)
    {
        total_error += fabs(100.0 - C[i]);
    }
    cout << "total error is " << total_error << endl;
    // Free Unified Memory
    cudaFree(A);
    cudaFree(B);
    cudaFree(C);
}
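// Hedged sketch (my addition): on Pascal or newer GPUs, prefetching unified
// memory to the device before the launch avoids page faults on first touch.
// test_gpu_2 above is correct without it; this is only an optimization.
void prefetch_to_gpu(float *p, int size)
{
    int device = 0;
    cudaGetDevice(&device);
    cudaMemPrefetchAsync(p, size, device, 0); // 0 = default stream
}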
// ========================================
// 2-dim
// ========================================
// matrix type, row-major: M(row, col) = *(M.elements + row * M.width + col)
struct Matrix
{
    int width;
    int height;
    float *elements;
};
// read element (row, col) of matrix A
__device__ float getElement(Matrix *A, int row, int col)
{
    return A->elements[row * A->width + col];
}
// write element (row, col) of matrix A
__device__ void setElement(Matrix *A, int row, int col, float value)
{
    A->elements[row * A->width + col] = value;
}
// matrix multiply kernel, 2-D grid, each thread computes one output element
__global__ void matMulKernel(Matrix *A, Matrix *B, Matrix *C)
{
    int row = threadIdx.y + blockIdx.y * blockDim.y;
    int col = threadIdx.x + blockIdx.x * blockDim.x;
    if (row >= C->height || col >= C->width) // guard for the rounded-up grid
        return;
    float sum = 0.0;
    for (int i = 0; i < A->width; ++i)
    {
        sum += getElement(A, row, i) * getElement(B, i, col);
    }
    setElement(C, row, col, sum);
}
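// Hedged sketch (my addition): a CPU reference for checking matMulKernel on
// small sizes; the row-major indexing matches getElement/setElement above.
void matMulCPU(const Matrix *A, const Matrix *B, Matrix *C)
{
    for (int row = 0; row < C->height; ++row)
        for (int col = 0; col < C->width; ++col)
        {
            float sum = 0.0;
            for (int i = 0; i < A->width; ++i)
                sum += A->elements[row * A->width + i]
                     * B->elements[i * B->width + col];
            C->elements[row * C->width + col] = sum;
        }
}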
void test_gpu_3()
{
    int width = 1 << 8;
    int height = 1 << 8;
    Matrix *A, *B, *C;
    // allocate managed (unified) memory
    cudaMallocManaged((void**)&A, sizeof(Matrix));
    cudaMallocManaged((void**)&B, sizeof(Matrix));
    cudaMallocManaged((void**)&C, sizeof(Matrix));
    int nBytes = width * height * sizeof(float);
    cudaMallocManaged((void**)&A->elements, nBytes);
    cudaMallocManaged((void**)&B->elements, nBytes);
    cudaMallocManaged((void**)&C->elements, nBytes);
    // initialize the data
    A->height = height;
    A->width = width;
    B->height = height;
    B->width = width;
    C->height = height;
    C->width = width;
    for (int i = 0; i < width * height; ++i)
    {
        A->elements[i] = 1.0;
        B->elements[i] = 2.0;
    }
    // kernel launch config
    dim3 blockSize(32, 32);
    dim3 gridSize(
        (width + blockSize.x - 1) / blockSize.x,
        (height + blockSize.y - 1) / blockSize.y
    );
    // launch the kernel
    matMulKernel<<<gridSize, blockSize>>>(A, B, C);
    // synchronize with the device so the result is valid on the host
    cudaDeviceSynchronize();
    // check the result: every element of C should be 2 * width
    float totalError = 0.0;
    for (int i = 0; i < width * height; ++i)
        totalError += fabs(C->elements[i] - 2 * width);
    cout << "total error is " << totalError << endl;
    // free managed memory
    cudaFree(A->elements);
    cudaFree(B->elements);
    cudaFree(C->elements);
    cudaFree(A);
    cudaFree(B);
    cudaFree(C);
}
int main()
{
    /*
    device_info();
    test_cpu();
    test_gpu_1();
    test_gpu_2();
    */
    test_gpu_3();
    return 0;
}