Last active: January 1, 2018, 13:24
-
-
Save karino2/66b3f5e8fc01d6d0ffc44d3bbdf10bca to your computer and use it in GitHub Desktop.
Matrix Multiplication for CUDA explanation
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <stdio.h>
#include <stdlib.h>

#include "util.h"

#define TILE_WIDTH 16
// Compute C = A * B with shared-memory tiling.
//
// All matrices are row-major: A is numARows x numACols, B is
// numBRows x numBCols, C is numCRows x numCCols. Callers must ensure
// numACols == numBRows, numCRows == numARows, numCCols == numBCols.
//
// Expected launch configuration: block = (TILE_WIDTH, TILE_WIDTH),
// grid.x covering the columns of C and grid.y covering the rows
// (ceil-division), matching the host code below.
//
// Fixes vs. the original:
//  - row/col were derived from the wrong grid axes (blockIdx.x for rows),
//    contradicting the host's dimGrid(cols, rows) — wrong results for any
//    non-square problem.
//  - Tile-load guards checked only the flat index (posA < sizeA), so
//    out-of-range threads wrapped into a neighboring row instead of
//    loading 0.
//  - Shared tiles are now indexed [ty][tx] so global loads are coalesced
//    (adjacent threadIdx.x reads adjacent addresses).
__global__ void matrixMultiplyShared(float* A, float* B, float* C,
                                     int numARows, int numACols,
                                     int numBRows, int numBCols,
                                     int numCRows, int numCCols)
{
    __shared__ float ds_A[TILE_WIDTH][TILE_WIDTH];
    __shared__ float ds_B[TILE_WIDTH][TILE_WIDTH];

    // Match the host grid: x spans columns of C, y spans rows.
    int col = blockIdx.x * TILE_WIDTH + threadIdx.x;
    int row = blockIdx.y * TILE_WIDTH + threadIdx.y;

    int numTiles = (numACols - 1) / TILE_WIDTH + 1;   // ceil(numACols / TILE_WIDTH)
    float cEntry = 0.0f;

    for (int tile = 0; tile < numTiles; tile++)
    {
        // Load one tile of A. Guard each dimension separately so a thread
        // past the matrix edge contributes 0 instead of wrapping into the
        // next row's data.
        int aCol = tile * TILE_WIDTH + threadIdx.x;
        if (row < numARows && aCol < numACols)
            ds_A[threadIdx.y][threadIdx.x] = A[row * numACols + aCol];
        else
            ds_A[threadIdx.y][threadIdx.x] = 0.0f;

        // Load one tile of B, same per-dimension guarding.
        int bRow = tile * TILE_WIDTH + threadIdx.y;
        if (bRow < numBRows && col < numBCols)
            ds_B[threadIdx.y][threadIdx.x] = B[bRow * numBCols + col];
        else
            ds_B[threadIdx.y][threadIdx.x] = 0.0f;

        __syncthreads();   // both tiles fully populated before any thread reads

        for (int i = 0; i < TILE_WIDTH; i++)
            cEntry += ds_A[threadIdx.y][i] * ds_B[i][threadIdx.x];

        __syncthreads();   // all reads done before the next iteration overwrites
    }

    // Threads in the grid's ragged edge compute a (zero-padded) value but
    // must not write outside C.
    if (row < numCRows && col < numCCols)
        C[row * numCCols + col] = cEntry;
}
// Host driver: load A and B via the project's util helpers, multiply on the
// GPU with the tiled kernel above, copy the result back, and verify it.
//
// Fixes vs. the original:
//  - The launch named an undefined kernel ("matrixMultiply"); it now calls
//    matrixMultiplyShared, the kernel actually defined in this file.
//  - Block shape uses TILE_WIDTH so it can never drift out of sync with the
//    kernel's shared-tile size.
//  - Deprecated cudaThreadSynchronize() replaced with cudaDeviceSynchronize().
//  - Launch/execution errors are checked instead of silently ignored.
//  - Unused wbArg_t local removed.
int main(int argc, char ** argv) {
    float * hostA;     // input matrix A (numARows x numAColumns, row-major)
    float * hostB;     // input matrix B (numBRows x numBColumns)
    float * hostC;     // output matrix C = A * B
    float * deviceA;
    float * deviceB;
    float * deviceC;
    int numARows;
    int numAColumns;
    int numBRows;
    int numBColumns;
    int numCRows;
    int numCColumns;

    hostA = (float *) util.load_first_matrix(&numARows, &numAColumns);
    hostB = (float *) util.load_second_matrix(&numBRows, &numBColumns);

    // C inherits its shape from the operands: (rows of A) x (cols of B).
    numCRows = numARows;
    numCColumns = numBColumns;
    hostC = (float *) malloc(numCRows * numCColumns * sizeof(float));

    cudaMalloc((void **)&deviceA, numARows * numAColumns * sizeof(float));
    cudaMalloc((void **)&deviceB, numBRows * numBColumns * sizeof(float));
    cudaMalloc((void **)&deviceC, numCRows * numCColumns * sizeof(float));

    cudaMemcpy(deviceA, hostA, numARows * numAColumns * sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(deviceB, hostB, numBRows * numBColumns * sizeof(float), cudaMemcpyHostToDevice);

    // Block shape must equal the kernel's TILE_WIDTH x TILE_WIDTH shared
    // tiles; grid.x covers columns of C, grid.y covers rows (ceil-division).
    dim3 dimBlock(TILE_WIDTH, TILE_WIDTH);
    dim3 dimGrid((numCColumns - 1) / TILE_WIDTH + 1,
                 (numCRows - 1) / TILE_WIDTH + 1);

    matrixMultiplyShared<<<dimGrid, dimBlock>>>(deviceA, deviceB, deviceC,
                                                numARows, numAColumns,
                                                numBRows, numBColumns,
                                                numCRows, numCColumns);

    cudaError_t err = cudaGetLastError();   // catches bad launch configuration
    if (err == cudaSuccess)
        err = cudaDeviceSynchronize();      // catches asynchronous execution errors
    if (err != cudaSuccess) {
        fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(err));
        return 1;
    }

    cudaMemcpy(hostC, deviceC, numCRows * numCColumns * sizeof(float), cudaMemcpyDeviceToHost);

    cudaFree(deviceA);
    cudaFree(deviceB);
    cudaFree(deviceC);

    util.assert_matmul_result(hostC, numCRows, numCColumns);

    free(hostA);
    free(hostB);
    free(hostC);
    return 0;
}
Sign up for free to join this conversation on GitHub.
Already have an account? Sign in to comment.