#include <iostream>
#include <cstdlib> // rand, RAND_MAX
#include <utility> // std::pair
/* Example code to demonstrate matrix multiplication of any appropriate dimensions using CUDA.
 * Matrix multiplication is a routine operation in any BLAS library; see https://docs.nvidia.com/cuda/cublas/index.html
 * Before working with CUDA, it is advisable to go over the CUDA documentation:
 * https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html
 * Author : Omar Mohamed <omarmsamir27@gmail.com> <github.com/omarsamir27>
 * for the HPC Centre at the Bibliotheca Alexandrina <https://hpc.bibalex.org> <github.com/hpcalex>
 */
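// Typical build and run (assuming the file is saved as matmul.cu and nvcc is on the PATH):
//   nvcc matmul.cu -o matmul
//   ./matmul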
__global__
void multiply(int* A, int2 Adims, int* B, int2 Bdims, int* ANS, int2 ANSdims) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x; // thread ID across all blocks
    if (idx >= ANSdims.x * ANSdims.y) return; // prevent extra threads from touching memory
    /* Convert the linear index to matrix subscripts; each thread is responsible for the
     * element whose index matches its thread ID. Matrices are stored row-major, so the
     * row is the index divided by the number of columns of ANS (ANSdims.y). */
    unsigned int row = idx / ANSdims.y;
    unsigned int col = idx % ANSdims.y;
    // Dot product over the inner dimension: the columns of A (== the rows of B)
    int K = Adims.y;
    int sum = 0;
    for (int i = 0; i < K; ++i) {
        sum += A[row * Adims.y + i] * B[i * Bdims.y + col];
    }
    ANS[row * ANSdims.y + col] = sum;
}
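/* For illustration: with ANSdims = (2, 3) stored row-major, thread idx = 4 maps to
 * row = 4 / 3 = 1 and col = 4 % 3 = 1, i.e. the element at ANS[1 * 3 + 1]. */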
// pretty print in matrix form
void pprint(int* mat, int2 dims) {
    for (auto i = 0; i < dims.x; ++i) {
        for (auto j = 0; j < dims.y; ++j) {
            std::cout << mat[i * dims.y + j] << '\t';
        }
        std::cout << std::endl;
    }
    std::cout << std::endl;
}
// Random integer in [min, max]; this form reduces the modulo bias of the naive rand() % n
inline int rand_inrange(int min, int max) {
    return min + rand() / (RAND_MAX / (max - min + 1) + 1);
}
// Try to choose kernel parameters that minimize the number of useless threads
std::pair<int, int> kernel_dims(int problem_size) {
    // Threads per block should ideally be a multiple of 32, because CUDA's
    // scheduling unit is a group of 32 threads (a warp)
    int threads_per_block = 256;
    // Round up so that every element gets a thread
    int num_blocks = (problem_size + threads_per_block - 1) / threads_per_block;
    return {num_blocks, threads_per_block};
}
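// For example, a 32x32 result has 1024 elements, so kernel_dims(1024) returns
// (1024 + 255) / 256 = 4 blocks of 256 threads, with no idle threads at all.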
int main() {
    // A and B are both 32x32 here; multiplication only requires that the
    // columns of A equal the rows of B (matAdims.y == matBdims.x)
    int2 matAdims = make_int2(32, 32);
    int2 matBdims = make_int2(32, 32);
    // The result has A's row count and B's column count
    int2 matANSdims = make_int2(matAdims.x, matBdims.y);
    int *arrA, *arrB, *arrANS;
    // Allocate unified memory
    /* Unified Memory does not need explicit copying back and forth between host and GPU memory;
     * see https://developer.nvidia.com/blog/unified-memory-cuda-beginners for more */
    cudaMallocManaged(&arrA, matAdims.x * matAdims.y * sizeof(int));
    cudaMallocManaged(&arrB, matBdims.x * matBdims.y * sizeof(int));
    cudaMallocManaged(&arrANS, matANSdims.x * matANSdims.y * sizeof(int));
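    // cudaMallocManaged returns a cudaError_t; a fuller program would check it, e.g.
    // (sketch only): if (cudaMallocManaged(&arrA, bytes) != cudaSuccess) { /* handle failure */ }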
    // Initialize the two input matrices randomly; rand() is never seeded here,
    // so the output is deterministic across runs
    for (auto i = 0; i < matAdims.x * matAdims.y; ++i) {
        arrA[i] = rand_inrange(0, 5);
    }
    for (auto i = 0; i < matBdims.x * matBdims.y; ++i) {
        arrB[i] = rand_inrange(0, 5);
    }
    int N = matANSdims.x * matANSdims.y; // total number of output elements
    // Find the number of blocks and threads per block, then launch the kernel
    auto dims = kernel_dims(N);
    multiply<<<dims.first, dims.second>>>(arrA, matAdims, arrB, matBdims, arrANS, matANSdims);
    cudaDeviceSynchronize(); // host does not proceed until the kernel has finished
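    // A minimal error check (one common pattern): after cudaDeviceSynchronize,
    // cudaGetLastError surfaces any launch or execution failure from the kernel
    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess) {
        std::cerr << "CUDA error: " << cudaGetErrorString(err) << std::endl;
    }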
std::cout << "matA:\n";
pprint(arrA,matAdims);
std::cout << "matB:\n";
pprint(arrB,matBdims);
std::cout << "matAns:\n";
pprint(arrANS,matANSdims);
return 0;
}