Skip to content

Instantly share code, notes, and snippets.

@karino2
Last active Jan 1, 2018
Embed
What would you like to do?
Matrix Multiplication for CUDA explanation
#include "util.h"
#define TILE_WIDTH 16
// Compute C = A * B using shared-memory tiling.
//
// Preconditions (asserted by convention, not checked at runtime):
//   - blockDim.x == blockDim.y == TILE_WIDTH
//   - grid x covers rows of C, grid y covers columns of C
//     (row is derived from the x index, col from the y index below)
//   - numACols == numBRows, numCRows == numARows, numCCols == numBCols
__global__ void matrixMultiplyShared(float* A, float* B, float* C,
int numARows, int numACols,
int numBRows, int numBCols,
int numCRows, int numCCols)
{
// One tile of A and one tile of B staged per iteration. The +1 column
// padding keeps the ds_A[threadIdx.x][i] column reads in the inner
// product loop from landing in the same shared-memory bank.
__shared__ float ds_A [TILE_WIDTH][TILE_WIDTH + 1];
__shared__ float ds_B [TILE_WIDTH][TILE_WIDTH + 1];
// NOTE: x maps to the row, y to the column — the launch configuration
// must match this (grid.x = row tiles, grid.y = column tiles).
int row = blockIdx.x * blockDim.x + threadIdx.x;
int col = blockIdx.y * blockDim.y + threadIdx.y;
// Number of TILE_WIDTH-wide strips along the shared (inner) dimension.
int numTiles = (numACols - 1)/ TILE_WIDTH + 1;
float cEntry = 0.0f;
for (int tile = 0; tile < numTiles; tile++)
{
// Guard BOTH coordinates when loading. The previous flat-index check
// (posA < numARows*numACols) let a tile that ran past the end of a
// row silently read the start of the next row, producing wrong sums
// whenever numACols is not a multiple of TILE_WIDTH.
int aCol = tile * TILE_WIDTH + threadIdx.y;
if (row < numARows && aCol < numACols)
ds_A[threadIdx.x][threadIdx.y] = A[row * numACols + aCol];
else
ds_A[threadIdx.x][threadIdx.y] = 0.0f;
int bRow = tile * TILE_WIDTH + threadIdx.x;
if (bRow < numBRows && col < numBCols)
ds_B[threadIdx.x][threadIdx.y] = B[bRow * numBCols + col];
else
ds_B[threadIdx.x][threadIdx.y] = 0.0f;
// All threads must finish loading before anyone reads the tiles.
__syncthreads();
for (int i = 0; i < TILE_WIDTH; i++)
cEntry += ds_A[threadIdx.x][i] * ds_B[i][threadIdx.y];
// Keep fast threads from overwriting the tiles for the next
// iteration while slow threads are still accumulating.
__syncthreads();
}
// Out-of-range threads only helped load tiles; they must not write C.
if (row < numCRows && col < numCCols)
C[row * numCCols + col] = cEntry;
}
// Host driver: load A and B, multiply on the GPU, verify C.
int main(int argc, char ** argv) {
wbArg_t args;
float * hostA; // The A matrix
float * hostB; // The B matrix
float * hostC; // The output C matrix
float * deviceA;
float * deviceB;
float * deviceC;
int numARows;
int numAColumns;
int numBRows;
int numBColumns;
int numCRows;
int numCColumns;
hostA = (float *) util.load_first_matrix(&numARows, &numAColumns);
hostB = (float *) util.load_second_matrix(&numBRows, &numBColumns);
// C = A * B has A's row count and B's column count.
numCRows = numARows;
numCColumns = numBColumns;
hostC = (float *) malloc(numCRows * numCColumns * sizeof(float));
cudaMalloc((void **)&deviceA, numARows * numAColumns * sizeof(float));
cudaMalloc((void **)&deviceB, numBRows * numBColumns * sizeof(float));
cudaMalloc((void **)&deviceC, numCRows * numCColumns * sizeof(float));
cudaMemcpy(deviceA, hostA, numARows * numAColumns * sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(deviceB, hostB, numBRows * numBColumns * sizeof(float), cudaMemcpyHostToDevice);
// The kernel requires blockDim.x == blockDim.y == TILE_WIDTH, so use the
// macro instead of a separate hard-coded constant that could drift.
// The kernel maps the x index to rows and the y index to columns, so
// grid.x must cover C's rows and grid.y its columns (the original had
// these swapped, which breaks non-square outputs).
dim3 dimGrid((numCRows - 1) / TILE_WIDTH + 1, (numCColumns - 1) / TILE_WIDTH + 1);
dim3 dimBlock(TILE_WIDTH, TILE_WIDTH);
// The kernel defined in this file is matrixMultiplyShared; the original
// launched a nonexistent matrixMultiply and would not link.
matrixMultiplyShared<<<dimGrid,dimBlock>>>(deviceA, deviceB, deviceC,
numARows, numAColumns,
numBRows, numBColumns,
numCRows, numCColumns);
// Catch launch-configuration errors immediately; kernel launches do not
// return a status themselves.
if (cudaGetLastError() != cudaSuccess) {
fprintf(stderr, "kernel launch failed\n");
return 1;
}
// cudaThreadSynchronize() is deprecated; cudaDeviceSynchronize() is the
// supported replacement (the following blocking cudaMemcpy would also
// synchronize, but an explicit sync surfaces kernel errors here).
cudaDeviceSynchronize();
cudaMemcpy(hostC, deviceC, numCRows * numCColumns * sizeof(float), cudaMemcpyDeviceToHost);
cudaFree(deviceA);
cudaFree(deviceB);
cudaFree(deviceC);
util.assert_matmul_result(hostC, numCRows, numCColumns);
free(hostA);
free(hostB);
free(hostC);
return 0;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment