CUDA matrix multiplication: Warp-thread linearization test
#include <stdio.h>

#define A 3000
#define B 4000
#define C 3000

// Computes out = left @ right, where `@` is matrix multiplication
// Dimensions:
//   left:  a x b
//   right: b x c
//   out:   a x c

// Monolithic kernel: one thread per element of the output matrix.
// (Each thread computes the dot product between a row in `left` and a col in `right`.)
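// For the dimensions above this means A * C = 3000 * 3000 = 9,000,000 threads in total.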
__global__ void matMul(float *left, float *right, float *out, int a, int b, int c) {
  // Use y to index rows and x to index cols (just to match the typical visualization).
  // row indexes into `left`, col indexes into `right`.
  int row = blockIdx.y * blockDim.y + threadIdx.y;
  int col = blockIdx.x * blockDim.x + threadIdx.x;
  if (row < a && col < c) {
    float sum = 0.0;
    // Each row of `left` and each col of `right` has `b` elements.
    for (int i = 0; i < b; i++) {
      // 1. If the row (indexed by threadIdx.y) is not changing within the warp, every thread reads the
      //    same element of `left` on a given iteration. This reduces to a single (broadcast) read per
      //    iteration, and since the row is traversed consecutively across iterations it will likely be
      //    cached after the first read.
      // 2. If threadIdx.x is changing within the warp, the 32 threads of the warp read consecutive
      //    positions from the same row of `right` on each iteration in coalesced fashion, so only one
      //    read per iteration across the warp. This results in 32 cols being read across the entire loop.
      sum += left[row * b + i] * right[i * c + col];
    }
    // 3. The write is coalesced if we assume `row` is not changing across the threads of the warp but
    //    `col` is: adjacent threads then write to adjacent elements of `out`.
    out[row * c + col] = sum;
  }
}
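// Illustration: with 32x32 thread blocks and x-first warp linearization, each warp has a fixed threadIdx.y
// and threadIdx.x = 0..31, so on iteration i the warp touches 32 consecutive floats of `right` starting at
// right[i*c + col] (128 bytes), which the hardware can typically serve as a single coalesced transaction.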
// Same as above, but row/col set to x/y instead.
__global__ void matMulBad(float *left, float *right, float *out, int a, int b, int c) {
  int row = blockIdx.x * blockDim.x + threadIdx.x;
  int col = blockIdx.y * blockDim.y + threadIdx.y;
  if (row < a && col < c) {
    float sum = 0.0;
    for (int i = 0; i < b; i++) {
      // 1. If row/threadIdx.x is changing within the warp, then on each iteration the threads do a strided
      //    access: they access elements of `left` separated by a stride of `b`. This results in
      //    non-coalesced accesses (multiple memory transactions). That is, we are reading across 32 rows
      //    of `left` one element at a time.
      // 2. If col/threadIdx.y is not changing within the warp, then every thread reads the same element of
      //    `right` on a given iteration, so only one read per iteration; but successive iterations step
      //    down the column with a stride of `c`, so this likely gets little benefit from caching.
      sum += left[row * b + i] * right[i * c + col];
    }
    // 3. Writes are not coalesced if we assume `row` is changing across the threads of the warp:
    //    the warp writes to elements of `out` separated by a stride of `c`.
    out[row * c + col] = sum;
  }
}
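// Illustration: here each warp has a fixed threadIdx.y while threadIdx.x runs 0..31, so `row` spans 32
// different values. On iteration i the warp reads left[row*b + i] at addresses b floats apart
// (16,000 bytes for B=4000), so the loads cannot coalesce and typically require 32 separate memory
// transactions per iteration.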
// Utility functions

// Fill an m x n matrix. val == -1 means "fill with 1, 2, 3, ..."; otherwise every element is set to val.
void initMat(float *mat, int m, int n, int val) {
  int size = m * n;
  for (int i = 0; i < size; i++) {
    if (val == -1) {
      mat[i] = i + 1;
    } else {
      mat[i] = val;
    }
  }
}

void printMat(const char *name, float *mat, int m, int n) {
  printf("%s: ", name);
  for (int i = 0; i < m; i++) {
    printf("[");
    for (int j = 0; j < n; j++) {
      // NOTE: Element (i, j) of a row-major m x n matrix lives at index i * n + j.
      printf("%.1f", mat[i * n + j]);
      if (j < n - 1) {
        printf(", ");
      }
    }
    printf("]\n");
  }
  printf("\n");
}

// NOTE: This code has no error handling!
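// A sketch of what error handling could look like (illustrative only): check every CUDA call, e.g.
//   cudaError_t err = cudaMalloc((void**)&d_left, sizeof(float) * A * B);
//   if (err != cudaSuccess) { fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(err)); exit(1); }
// and check cudaGetLastError() / cudaDeviceSynchronize() after each kernel launch.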
int main(int argc, char *argv[]) {
  // Simple "debug" flag.
  bool debug = argc > 1;

  // Host pointers
  float *left, *right, *out;
  // CUDA/Device pointers
  float *d_left, *d_right, *d_out;

  // Allocate memory on host
  // Matrix dimensions:
  //   left:  a x b
  //   right: b x c
  //   out:   a x c
  left = (float*)malloc(sizeof(float) * A * B);
  right = (float*)malloc(sizeof(float) * B * C);
  out = (float*)malloc(sizeof(float) * A * C);

  // Initialize values
  initMat(left, A, B, -1);
  initMat(right, B, C, -1);
  if (debug) {
    printMat("left", left, A, B);
    printMat("right", right, B, C);
  }

  // Allocate device memory for left, right, out
  cudaMalloc((void**)&d_left, sizeof(float) * A * B);
  cudaMalloc((void**)&d_right, sizeof(float) * B * C);
  cudaMalloc((void**)&d_out, sizeof(float) * A * C);

  // Transfer data from host to device memory
  cudaMemcpy(d_left, left, sizeof(float) * A * B, cudaMemcpyHostToDevice);
  cudaMemcpy(d_right, right, sizeof(float) * B * C, cudaMemcpyHostToDevice);

  // 1. Calling the `matMul` (good) kernel:
  //    x dimension assigned to the columns of the output matrix
  //    y dimension assigned to the rows of the output matrix
  dim3 threadsPerBlock(32, 32); // 1024 threads per block, spread equally across x (col) and y (row)
  // Need enough thread blocks in the grid to "cover" the output matrix.
  dim3 blocksPerGrid((C + 31)/32, (A + 31)/32);
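  // For A = C = 3000: (3000 + 31)/32 = 94 blocks per grid dimension, so 94 * 94 = 8,836 blocks and
  // 8,836 * 1,024 = 9,048,064 threads, slightly more than the 9,000,000 output elements; the
  // `if (row < a && col < c)` guard masks off the excess threads.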
  matMul<<<blocksPerGrid, threadsPerBlock>>>(d_left, d_right, d_out, A, B, C);

  // Transfer data from device to host memory (implicitly synchronizes/waits for the CUDA kernel to complete)
  cudaMemcpy(out, d_out, sizeof(float) * A * C, cudaMemcpyDeviceToHost);

  // Simple correctness check: for A=3, B=4, C=3 the output SHOULD BE:
  // [70.0, 80.0, 90.0]
  // [158.0, 184.0, 210.0]
  // [246.0, 288.0, 330.0]
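  // e.g. out[0][0] = 1*1 + 2*4 + 3*7 + 4*10 = 70: the dot product of left's row 0 ([1,2,3,4])
  // and right's col 0 ([1,4,7,10]).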
  if (debug) {
    printMat("out", out, A, C);
  }

  // 2. Calling `matMulBad`:
  //    x dimension assigned to the rows of the output
  //    y dimension assigned to the cols of the output
  dim3 blocksPerGridBad((A + 31)/32, (C + 31)/32);
  matMulBad<<<blocksPerGridBad, threadsPerBlock>>>(d_left, d_right, d_out, A, B, C);
  cudaMemcpy(out, d_out, sizeof(float) * A * C, cudaMemcpyDeviceToHost);
  if (debug) {
    printMat("out", out, A, C);
  }

  printf("A: %d, B: %d, C: %d\n", A, B, C);

  // Cleanup after kernel execution
  cudaFree(d_left); cudaFree(d_right); cudaFree(d_out);
  free(left); free(right); free(out);
  return 0;
}
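To actually compare the two kernels, one option (a minimal sketch, not spelled out in the file above; it assumes the source is saved as matmul.cu) is to compile with nvcc and bracket each launch with CUDA events inside main:

  // Compile and run (example invocation):
  //   nvcc -O2 matmul.cu -o matmul && ./matmul

  // Timing one launch with CUDA events (would wrap the matMul launch inside main):
  cudaEvent_t start, stop;
  cudaEventCreate(&start);
  cudaEventCreate(&stop);
  cudaEventRecord(start);
  matMul<<<blocksPerGrid, threadsPerBlock>>>(d_left, d_right, d_out, A, B, C);
  cudaEventRecord(stop);
  cudaEventSynchronize(stop);  // wait for the kernel to finish
  float ms = 0.0f;
  cudaEventElapsedTime(&ms, start, stop);
  printf("matMul: %.3f ms\n", ms);
  cudaEventDestroy(start);
  cudaEventDestroy(stop);

Doing the same around the matMulBad launch (with blocksPerGridBad) makes the cost of the strided, non-coalesced accesses visible; with these dimensions the x/y-swapped kernel is expected to be substantially slower.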