@pchng
Created March 6, 2024 19:44
CUDA: matmul bad
// Naive matrix multiply: out (a x c) = left (a x b) * right (b x c).
// Same as the coalesced version, but with row/col mapped to x/y instead.
__global__ void matMulBad(float *left, float *right, float *out, int a, int b, int c) {
  int row = blockIdx.x * blockDim.x + threadIdx.x;
  int col = blockIdx.y * blockDim.y + threadIdx.y;
  if (row < a && col < c) {
    float sum = 0.0f;
    for (int i = 0; i < b; i++) {
      // 1. threadIdx.x varies fastest within a warp (assuming blockDim.x >= 32), so `row`
      //    changes from thread to thread and each iteration does a strided access: the threads
      //    touch elements of `left` separated by a stride of `b`. These accesses cannot be
      //    coalesced, so the warp issues multiple memory transactions; effectively we read
      //    across 32 rows of `left` one element at a time.
      // 2. `col`/threadIdx.y does not change within the warp, so every thread reads the same
      //    element of `right`. That broadcast costs only one read per iteration, but it gains
      //    little from caching because consecutive iterations step down the column with a
      //    stride of `c`.
      sum += left[row * b + i] * right[i * c + col];
    }
    // 3. The writes are not coalesced either: `row` changes across the warp, so the threads
    //    write to elements of `out` separated by a stride of `c`.
    out[row * c + col] = sum;
  }
}
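
For contrast, here is a minimal sketch of the coalesced counterpart that the opening comment alludes to (the name matMulGood is illustrative; the companion kernel itself is not shown in this gist). Swapping the mapping so that threadIdx.x drives the column makes consecutive threads in a warp touch consecutive addresses in both `right` and `out`:

// Coalesced version: col comes from x (the fastest-varying index within a warp), row from y.
__global__ void matMulGood(float *left, float *right, float *out, int a, int b, int c) {
  int row = blockIdx.y * blockDim.y + threadIdx.y;
  int col = blockIdx.x * blockDim.x + threadIdx.x;
  if (row < a && col < c) {
    float sum = 0.0f;
    for (int i = 0; i < b; i++) {
      // Every thread in the warp reads the same left[row * b + i] (a single broadcast),
      // and consecutive threads read consecutive elements of row i of `right`,
      // so each operand needs only one coalesced transaction per iteration.
      sum += left[row * b + i] * right[i * c + col];
    }
    // Consecutive threads write consecutive elements of a row of `out`: coalesced.
    out[row * c + col] = sum;
  }
}

With a launch configuration such as dim3 block(32, 8), a full warp lies along x, which is the layout both kernels' comments assume; the only change from matMulBad is which axis feeds row versus col.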