Created
March 6, 2024 19:44
-
-
Save pchng/d91bf46a250cca238ab090d058a52346 to your computer and use it in GitHub Desktop.
CUDA: matmul bad
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Same as above, but row/col set to x/y instead. | |
__global__ void matMulBad(float *left, float *right, float *out, int a, int b, int c) { | |
int row = blockIdx.x * blockDim.x + threadIdx.x; | |
int col = blockIdx.y * blockDim.y + threadIdx.y; | |
if (row < a && col < c) { | |
float sum = 0.0; | |
for (int i = 0; i < b; i++) { | |
// 1. If row/threadIdx.x is changing within the warp, then on each iteration the threads do a strided access: | |
// They will access elements separated by a stride of b. This results in non-coalesced accesses (multiple memory reads) | |
// That is, we are reading across 32 rows in `left` one element at a time. | |
// 2. If col/threadIdx.y is not changing within the warp, then each thread reads the same column. | |
// This results in only one read per iteration, but likely doesn't take advantage of caching if each element in the col | |
// is separated by a large stride. | |
sum += left[row * b + i] * right[i * c + col]; | |
} | |
// 3. Writes are not coalesced well if we assume `row` is changing across each thread in the warp. | |
// Then we are writing out to elements separated by a stride of `c`. | |
out[row * c + col] = sum; | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment