Skip to content

Instantly share code, notes, and snippets.

@princewang1994
Last active November 4, 2018 15:19
Show Gist options
  • Save princewang1994/bf856771d28d3a71885be2fa25dd14a1 to your computer and use it in GitHub Desktop.
简单的cuda矩阵相乘代码
#include <cstdlib>
#include <stdio.h>
#include <cuda_runtime.h>
#include <helper_cuda.h>
#include <iostream>
// Simple row-major matrix descriptor: element (row, col) is stored at
// elements[row * width + col]. The elements buffer is allocated by the
// caller (managed/unified memory in this program) and is not owned here.
struct Matrix
{
int width;  // number of columns
int height;  // number of rows
float *elements;  // contiguous width * height floats, row-major
};
// Return element (row, col) of the row-major matrix A.
__device__ float getElement(Matrix *A, int row, int col)
{
    int offset = row * A->width + col;
    return A->elements[offset];
}
// Store value into element (row, col) of the row-major matrix A.
__device__ void setElement(Matrix *A, int row, int col, float value)
{
    int offset = row * A->width + col;
    A->elements[offset] = value;
}
// Matrix-multiply kernel: C = A * B, one thread per output element.
// Expects a 1-D launch with gridDim.x * blockDim.x >= C->height * C->width;
// surplus threads fall through the bounds guard and do nothing.
// Preconditions (not checked): A->width == B->height, C->height == A->height,
// C->width == B->width.
__global__ void matMulKernel(Matrix *A, Matrix *B, Matrix *C)
{
    int H = C->height, W = C->width;
    // Flat global thread index mapped to a (row h, col w) output position.
    int idx = threadIdx.x + blockIdx.x * blockDim.x;
    int h = idx / W;
    int w = idx % W;
    if (h < H && w < W) {
        // 0.0f (not 0.0): a double literal would silently promote the whole
        // accumulation to double precision, which is much slower on most GPUs.
        float Cvalue = 0.0f;
        for (int i = 0; i < A->width; ++i)
        {
            Cvalue += getElement(A, h, i) * getElement(B, i, w);
        }
        setElement(C, h, w, Cvalue);
    }
}
// Print matrix m to stdout: one row per line, elements separated by spaces.
void showMat(Matrix *m){
    const int rows = m->height;
    const int cols = m->width;
    for (int r = 0; r < rows; ++r) {
        for (int c = 0; c < cols; ++c) {
            std::cout << m->elements[r * cols + c] << " ";
        }
        std::cout << std::endl;
    }
}
// Abort with file/line and a readable message if a CUDA API call fails.
// Kernel launches do not return errors directly, so the launch below is
// followed by cudaGetLastError() + cudaDeviceSynchronize() checks.
#define CUDA_CHECK(call)                                                      \
    do {                                                                      \
        cudaError_t err_ = (call);                                            \
        if (err_ != cudaSuccess) {                                            \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,     \
                    cudaGetErrorString(err_));                                \
            exit(EXIT_FAILURE);                                               \
        }                                                                     \
    } while (0)

int main()
{
    // Problem sizes: A is m x k, B is k x n, C is m x n.
    int m = 10, k = 4, n = 5;
    Matrix *A, *B, *C;

    // Managed (unified) memory is accessible from both host and device,
    // so no explicit cudaMemcpy between host and device is required.
    CUDA_CHECK(cudaMallocManaged((void**)&A, sizeof(Matrix)));
    CUDA_CHECK(cudaMallocManaged((void**)&B, sizeof(Matrix)));
    CUDA_CHECK(cudaMallocManaged((void**)&C, sizeof(Matrix)));
    CUDA_CHECK(cudaMallocManaged((void**)&A->elements, m * k * sizeof(float)));
    CUDA_CHECK(cudaMallocManaged((void**)&B->elements, k * n * sizeof(float)));
    CUDA_CHECK(cudaMallocManaged((void**)&C->elements, m * n * sizeof(float)));

    // Initialize dimensions: A(m x k), B(k x n), C(m x n).
    A->height = m; A->width = k;
    B->height = k; B->width = n;
    C->height = m; C->width = n;

    // Fill A with 1s and B with 2s; every element of C should come out as 2*k.
    for (int i = 0; i < m * k; ++i)
    {
        A->elements[i] = 1.0f;
    }
    for (int i = 0; i < k * n; ++i)
    {
        B->elements[i] = 2.0f;
    }

    // 1-D launch: one thread per element of C, ceil-divided into blocks.
    int blockSize = 8;
    int gridSize = (m * n + blockSize - 1) / blockSize;
    matMulKernel <<< gridSize, blockSize >>>(A, B, C);
    CUDA_CHECK(cudaGetLastError());       // catch launch-configuration errors
    CUDA_CHECK(cudaDeviceSynchronize());  // wait for the kernel; catch execution errors

    std::cout << "Matrix A:" << std::endl;
    showMat(A);
    std::cout << "Matrix B:" << std::endl;
    showMat(B);
    std::cout << "Matrix C:" << std::endl;
    showMat(C);

    // Release all managed allocations (the original leaked every one of these);
    // free each element buffer before its owning descriptor.
    CUDA_CHECK(cudaFree(A->elements));
    CUDA_CHECK(cudaFree(B->elements));
    CUDA_CHECK(cudaFree(C->elements));
    CUDA_CHECK(cudaFree(A));
    CUDA_CHECK(cudaFree(B));
    CUDA_CHECK(cudaFree(C));
    return 0;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment