#include <iostream>
#include <cstdlib> // rand, RAND_MAX
#include <utility> // std::pair
/* Example code to demonstrate matrix multiplication of any appropriate dimensions using CUDA.
 * Matrix multiplication is a routine operation in any BLAS library; see https://docs.nvidia.com/cuda/cublas/index.html
 * Before working with CUDA, it is advisable to go over the CUDA documentation:
 * https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html
 * Author : Omar Mohamed <omarmsamir27@gmail.com> <github.com/omarsamir27>
 * for the HPC Centre at the Bibliotheca Alexandrina <https://hpc.bibalex.org> <github.com/hpcalex>
 */
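// Typical build and run (assuming the file is saved as matmul.cu and nvcc is on the PATH):
//   nvcc matmul.cu -o matmul
//   ./matmul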
__global__
void multiply(int* A, int2 Adims, int* B, int2 Bdims, int* ANS, int2 ANSdims) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x; // thread ID across all blocks
    if (idx >= ANSdims.x * ANSdims.y) return; // prevent extra threads from touching memory
    /* Convert the linear index to matrix subscripts; each thread is responsible for the
     * element whose index matches its thread ID. Matrices are stored row-major, so the
     * row is the index divided by the number of columns of ANS (ANSdims.y). */
    unsigned int row = idx / ANSdims.y;
    unsigned int col = idx % ANSdims.y;
    // Dot product over the inner dimension: the columns of A (== the rows of B)
    int K = Adims.y;
    int sum = 0;
    for (int i = 0; i < K; ++i) {
        sum += A[row * Adims.y + i] * B[i * Bdims.y + col];
    }
    ANS[row * ANSdims.y + col] = sum;
}
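/* For illustration: with ANSdims = (2, 3) stored row-major, thread idx = 4 maps to
 * row = 4 / 3 = 1 and col = 4 % 3 = 1, i.e. the element at ANS[1 * 3 + 1]. */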
// pretty print in matrix form
void pprint(int* mat, int2 dims) {
    for (auto i = 0; i < dims.x; ++i) {
        for (auto j = 0; j < dims.y; ++j) {
            std::cout << mat[i * dims.y + j] << '\t';
        }
        std::cout << std::endl;
    }
    std::cout << std::endl;
}
// Random integer in [min, max]; this form reduces the modulo bias of the naive rand() % n
inline int rand_inrange(int min, int max) {
    return min + rand() / (RAND_MAX / (max - min + 1) + 1);
}
// Try to choose kernel parameters that minimize the number of useless threads
std::pair<int, int> kernel_dims(int problem_size) {
    // Threads per block should ideally be a multiple of 32, because CUDA's
    // scheduling unit is a group of 32 threads (a warp)
    int threads_per_block = 256;
    // Round up so that every element gets a thread
    int num_blocks = (problem_size + threads_per_block - 1) / threads_per_block;
    return {num_blocks, threads_per_block};
}
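// For example, a 32x32 result has 1024 elements, so kernel_dims(1024) returns
// (1024 + 255) / 256 = 4 blocks of 256 threads, with no idle threads at all.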
int main() {
    // A and B are both 32x32 here; multiplication only requires that the
    // columns of A equal the rows of B (matAdims.y == matBdims.x)
    int2 matAdims = make_int2(32, 32);
    int2 matBdims = make_int2(32, 32);
    // The result has A's row count and B's column count
    int2 matANSdims = make_int2(matAdims.x, matBdims.y);
    int *arrA, *arrB, *arrANS;
    // Allocate unified memory
    /* Unified Memory does not need explicit copying back and forth between host and GPU memory;
     * see https://developer.nvidia.com/blog/unified-memory-cuda-beginners for more */
    cudaMallocManaged(&arrA, matAdims.x * matAdims.y * sizeof(int));
    cudaMallocManaged(&arrB, matBdims.x * matBdims.y * sizeof(int));
    cudaMallocManaged(&arrANS, matANSdims.x * matANSdims.y * sizeof(int));
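    // cudaMallocManaged returns a cudaError_t; a fuller program would check it, e.g.
    // (sketch only): if (cudaMallocManaged(&arrA, bytes) != cudaSuccess) { /* handle failure */ }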
    // Initialize the two input matrices randomly; rand() is never seeded here,
    // so the output is deterministic across runs
    for (auto i = 0; i < matAdims.x * matAdims.y; ++i) {
        arrA[i] = rand_inrange(0, 5);
    }
    for (auto i = 0; i < matBdims.x * matBdims.y; ++i) {
        arrB[i] = rand_inrange(0, 5);
    }
    int N = matANSdims.x * matANSdims.y; // total number of output elements
    // Find the number of blocks and threads per block, then launch the kernel
    auto dims = kernel_dims(N);
    multiply<<<dims.first, dims.second>>>(arrA, matAdims, arrB, matBdims, arrANS, matANSdims);
    cudaDeviceSynchronize(); // host does not proceed until the kernel has finished
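    // A minimal error check (one common pattern): after cudaDeviceSynchronize,
    // cudaGetLastError surfaces any launch or execution failure from the kernel
    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess) {
        std::cerr << "CUDA error: " << cudaGetErrorString(err) << std::endl;
    }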
std::cout << "matA:\n";
pprint(arrA,matAdims);
std::cout << "matB:\n";
pprint(arrB,matBdims);
std::cout << "matAns:\n";
pprint(arrANS,matANSdims);
return 0;
}