Last active
November 4, 2018 15:19
-
-
Save princewang1994/bf856771d28d3a71885be2fa25dd14a1 to your computer and use it in GitHub Desktop.
简单的cuda矩阵相乘代码
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <stdio.h>
#include <cstdlib>
#include <iostream>

#include <cuda_runtime.h>
#include <helper_cuda.h>
// Row-major 2-D matrix descriptor shared between host and device code.
// Element (row, col) lives at elements[row * width + col].
// In this file both the descriptor and `elements` are allocated with
// cudaMallocManaged (see main), so the same pointers are valid on both sides.
struct Matrix
{
    int width;       // number of columns
    int height;      // number of rows
    float *elements; // row-major storage, length = width * height
};
// Read element (row, col) of row-major matrix A.
// const-correct: this accessor never mutates the matrix, and inlining a
// one-line device helper avoids any call overhead.
__device__ __forceinline__ float getElement(const Matrix *A, int row, int col)
{
    return A->elements[row * A->width + col];
}
// Store `value` into element (row, col) of row-major matrix A.
__device__ void setElement(Matrix *A, int row, int col, float value)
{
    const int idx = row * A->width + col; // flatten (row, col) to row-major offset
    A->elements[idx] = value;
}
// Matrix-multiply kernel: C = A * B, one output element per flat thread index.
// Expects a 1-D grid/block launch; a grid-stride loop makes the kernel correct
// for ANY <<<gridSize, blockSize>>> configuration (including a single block),
// instead of silently requiring gridSize * blockSize >= C->height * C->width.
// Precondition (caller's responsibility): A->width == B->height,
// C->height == A->height, C->width == B->width.
__global__ void matMulKernel(Matrix *A, Matrix *B, Matrix *C)
{
    const int W = C->width;
    const int total = C->height * W;
    const int stride = gridDim.x * blockDim.x;
    for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < total; idx += stride)
    {
        const int h = idx / W;
        const int w = idx % W;      // 0 <= w < W by construction
        float Cvalue = 0.0f;        // float literal: avoid accidental double math
        for (int i = 0; i < A->width; ++i)
        {
            Cvalue += getElement(A, h, i) * getElement(B, i, w);
        }
        setElement(C, h, w, Cvalue);
    }
}
// Print matrix m to stdout: one row per line, elements separated by a space.
void showMat(Matrix *m){
    const int rows = m->height;
    const int cols = m->width;
    const float *data = m->elements;
    for(int r = 0; r < rows; ++r){
        for(int c = 0; c < cols; ++c){
            std::cout << data[r * cols + c] << " ";
        }
        std::cout << std::endl;
    }
}
// CUDA_CHECK: abort with a diagnostic when a CUDA API call fails.
// Kernel launches return no status directly: launch-configuration errors are
// surfaced by cudaGetLastError(), in-kernel faults by the next synchronizing call.
#define CUDA_CHECK(call)                                                      \
    do {                                                                      \
        cudaError_t err_ = (call);                                            \
        if (err_ != cudaSuccess) {                                            \
            std::cerr << "CUDA error " << __FILE__ << ":" << __LINE__         \
                      << ": " << cudaGetErrorString(err_) << std::endl;       \
            std::exit(EXIT_FAILURE);                                          \
        }                                                                     \
    } while (0)

// Demo driver: computes C(m x n) = A(m x k) * B(k x n) on the GPU.
// Uses unified (managed) memory, so no explicit cudaMemcpy between host and
// device is needed; cudaDeviceSynchronize() makes the result visible on host.
int main()
{
    const int m = 10, k = 4, n = 5;
    Matrix *A, *B, *C;

    // Allocate descriptors and element buffers in managed memory, checking
    // every call (the original ignored all return codes).
    CUDA_CHECK(cudaMallocManaged((void**)&A, sizeof(Matrix)));
    CUDA_CHECK(cudaMallocManaged((void**)&B, sizeof(Matrix)));
    CUDA_CHECK(cudaMallocManaged((void**)&C, sizeof(Matrix)));
    CUDA_CHECK(cudaMallocManaged((void**)&A->elements, m * k * sizeof(float)));
    CUDA_CHECK(cudaMallocManaged((void**)&B->elements, k * n * sizeof(float)));
    CUDA_CHECK(cudaMallocManaged((void**)&C->elements, m * n * sizeof(float)));

    // Initialize A(m x k) with 1s and B(k x n) with 2s; C receives the result.
    A->height = m; A->width = k;
    B->height = k; B->width = n;
    C->height = m; C->width = n;
    for (int i = 0; i < m * k; ++i)
    {
        A->elements[i] = 1.0f; // float literal, matching the element type
    }
    for (int i = 0; i < k * n; ++i)
    {
        B->elements[i] = 2.0f;
    }

    // One thread per output element; ceil-divide so the grid covers m * n.
    const int blockSize = 8;
    const int gridSize = (m * n + blockSize - 1) / blockSize;
    matMulKernel <<< gridSize, blockSize >>>(A, B, C);
    CUDA_CHECK(cudaGetLastError());      // catch bad launch configuration
    CUDA_CHECK(cudaDeviceSynchronize()); // wait for the kernel; catch faults

    std::cout << "Matrix A:" << std::endl;
    showMat(A);
    std::cout << "Matrix B:" << std::endl;
    showMat(B);
    std::cout << "Matrix C:" << std::endl;
    showMat(C);

    // Release all managed allocations (element buffers before descriptors);
    // the original leaked all six.
    CUDA_CHECK(cudaFree(A->elements));
    CUDA_CHECK(cudaFree(B->elements));
    CUDA_CHECK(cudaFree(C->elements));
    CUDA_CHECK(cudaFree(A));
    CUDA_CHECK(cudaFree(B));
    CUDA_CHECK(cudaFree(C));
    return 0;
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment