@pchng · Created March 6, 2024
CUDA matrix multiplication: Warp-thread linearization test
#include <stdio.h>
#include <stdlib.h> // for malloc/free/exit
#define A 3000
#define B 4000
#define C 3000
// Computes out = left @ right, where `@` is matrix multiplication.
// Dimensions:
// left: a x b
// right: b x c
// out: a x c
// Monolithic kernel: one thread per output element of `out`.
// (Each thread computes the dot product of a row of `left` with a column of `right`.)
__global__ void matMul(float *left, float *right, float *out, int a, int b, int c) {
    // Use y to index rows and x to index cols (just to match the typical visualization).
    // row indexes into `left`, col indexes into `right`.
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    int col = blockIdx.x * blockDim.x + threadIdx.x;
    if (row < a && col < c) {
        float sum = 0.0f;
        // Each row of `left` and each col of `right` has `b` elements.
        for (int i = 0; i < b; i++) {
            // 1. Because `row` (derived from threadIdx.y) does not change across the warp, every
            //    thread reads the same element of `left`. This broadcasts as a single read per
            //    iteration, and since the row is traversed consecutively it caches well.
            // 2. Because threadIdx.x changes across the warp, its 32 threads read 32 consecutive
            //    positions from the same row of `right` on each iteration in coalesced fashion,
            //    so only one read per iteration across the warp.
            //    This results in 32 cols being read across the entire loop.
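            //    Concrete sketch of (2), not in the original gist: lane k of the warp loads
            //    right[i * c + (col0 + k)] for some base column col0, so the warp's 32 float
            //    loads span one contiguous 128-byte segment, typically a single transaction.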
            sum += left[row * b + i] * right[i * c + col];
        }
        // 3. The write is coalesced: `row` does not change across the threads of the warp but
        //    `col` does, so adjacent threads write to adjacent elements of `out`.
        out[row * c + col] = sum;
    }
}
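// Why threadIdx.y is constant within a warp above (illustrative sketch, not used by the
// kernels): CUDA linearizes a block's threads x-fastest, and warps are formed from 32
// consecutive linear IDs. With a 32x32 block, each warp therefore spans threadIdx.x = 0..31
// at a single threadIdx.y. A hypothetical helper making the mapping explicit:
__device__ void warpAndLane(int *warp, int *lane) {
    int linear = threadIdx.z * blockDim.y * blockDim.x
               + threadIdx.y * blockDim.x
               + threadIdx.x;
    *warp = linear / warpSize; // warpSize is 32 on current NVIDIA GPUs
    *lane = linear % warpSize;
}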
// Same as above, but with row/col assigned to x/y instead.
__global__ void matMulBad(float *left, float *right, float *out, int a, int b, int c) {
    int row = blockIdx.x * blockDim.x + threadIdx.x;
    int col = blockIdx.y * blockDim.y + threadIdx.y;
    if (row < a && col < c) {
        float sum = 0.0f;
        for (int i = 0; i < b; i++) {
            // 1. Because row/threadIdx.x changes across the warp, on each iteration the threads
            //    perform a strided access: they touch elements separated by a stride of `b`,
            //    which cannot be coalesced (multiple memory transactions). That is, we are
            //    reading across 32 rows of `left` one element at a time.
            // 2. Because col/threadIdx.y does not change across the warp, every thread reads the
            //    same element of `right`. This broadcasts as a single read per iteration, but it
            //    caches poorly because successive iterations jump down the column by a stride of `c`.
            sum += left[row * b + i] * right[i * c + col];
        }
        // 3. Writes are not coalesced: `row` changes across the threads of the warp, so adjacent
        //    threads write to elements separated by a stride of `c`.
        out[row * c + col] = sum;
    }
}
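// Concrete sketch of the strided pattern above, not in the original gist: with B = 4000,
// lane k of a warp loads left[(row0 + k) * 4000 + i], so consecutive lanes' addresses sit
// 16000 bytes apart. Every lane touches a different cache line, turning what could be one
// memory transaction into 32.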
// Utility functions
void initMat(float *mat, int m, int n, int val) {
    int size = m * n;
    for (int i = 0; i < size; i++) {
        if (val == -1) {
            // Sentinel value -1 means "fill with 1, 2, 3, ..." instead of a constant.
            mat[i] = i + 1;
        } else {
            mat[i] = val;
        }
    }
}
void printMat(const char *name, float *mat, int m, int n) {
    printf("%s: ", name);
    for (int i = 0; i < m; i++) {
        printf("[");
        for (int j = 0; j < n; j++) {
            // NOTE: Current position is row * NUM_COLS + current col.
            printf("%.1f", mat[i * n + j]);
            if (j < n - 1) {
                printf(", ");
            }
        }
        printf("]\n");
    }
    printf("\n");
}
// NOTE: This code has no error handling!
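// A minimal error-checking macro one could wire in (illustrative sketch; the name CUDA_CHECK
// is an assumption, not part of the original gist). Each cudaMalloc/cudaMemcpy call below
// could be wrapped as CUDA_CHECK(cudaMalloc(...)); it is defined here but left unused.
#define CUDA_CHECK(call) do { \
    cudaError_t err = (call); \
    if (err != cudaSuccess) { \
        fprintf(stderr, "CUDA error %s at %s:%d\n", cudaGetErrorString(err), __FILE__, __LINE__); \
        exit(EXIT_FAILURE); \
    } \
} while (0)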
int main(int argc, char *argv[]) {
    // Simple "debug" flag.
    bool debug = argc > 1;
    // Host pointers
    float *left, *right, *out;
    // CUDA/Device pointers
    float *d_left, *d_right, *d_out;
    // Allocate memory on host.
    // Matrix dimensions:
    // left: a x b
    // right: b x c
    // out: a x c
    left = (float*)malloc(sizeof(float) * A * B);
    right = (float*)malloc(sizeof(float) * B * C);
    out = (float*)malloc(sizeof(float) * A * C);
    // Initialize values.
    initMat(left, A, B, -1);
    initMat(right, B, C, -1);
    if (debug) {
        printMat("left", left, A, B);
        printMat("right", right, B, C);
    }
    // Allocate device memory for left, right, out.
    cudaMalloc((void**)&d_left, sizeof(float) * A * B);
    cudaMalloc((void**)&d_right, sizeof(float) * B * C);
    cudaMalloc((void**)&d_out, sizeof(float) * A * C);
    // Transfer data from host to device memory.
    cudaMemcpy(d_left, left, sizeof(float) * A * B, cudaMemcpyHostToDevice);
    cudaMemcpy(d_right, right, sizeof(float) * B * C, cudaMemcpyHostToDevice);
    // 1. Calling the `matMul` (good) kernel:
    //    x dimension assigned to the columns of the output matrix
    //    y dimension assigned to the rows of the output matrix
    dim3 threadsPerBlock(32, 32); // 1024 threads per block, equally spread across x (col) and y (row).
    // Need enough thread blocks in the grid to "cover" the output matrix (ceiling division).
    dim3 blocksPerGrid((C + 31) / 32, (A + 31) / 32);
    matMul<<<blocksPerGrid, threadsPerBlock>>>(d_left, d_right, d_out, A, B, C);
    // Transfer data from device to host memory (implicitly synchronizes with/waits for the kernel to complete).
    cudaMemcpy(out, d_out, sizeof(float) * A * C, cudaMemcpyDeviceToHost);
    // Simple correctness check; for A=3, B=4, C=3 the output should be:
    // [70.0, 80.0, 90.0]
    // [158.0, 184.0, 210.0]
    // [246.0, 288.0, 330.0]
    if (debug) {
        printMat("out", out, A, C);
    }
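    // A host-side reference check one could add here (sketch only, not in the original gist;
    // assumes A/B/C are small enough for an O(A*B*C) CPU loop and would need <math.h> for fabsf):
    // for (int i = 0; i < A; i++) {
    //     for (int j = 0; j < C; j++) {
    //         float expected = 0.0f;
    //         for (int k = 0; k < B; k++) expected += left[i * B + k] * right[k * C + j];
    //         if (fabsf(expected - out[i * C + j]) > 1e-3f * fabsf(expected)) {
    //             printf("Mismatch at (%d, %d): expected %f, got %f\n", i, j, expected, out[i * C + j]);
    //         }
    //     }
    // }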
    // 2. Calling the `matMulBad` kernel:
    //    x dimension assigned to the rows of the output matrix
    //    y dimension assigned to the columns of the output matrix
    dim3 blocksPerGridBad((A + 31) / 32, (C + 31) / 32);
    matMulBad<<<blocksPerGridBad, threadsPerBlock>>>(d_left, d_right, d_out, A, B, C);
    cudaMemcpy(out, d_out, sizeof(float) * A * C, cudaMemcpyDeviceToHost);
    if (debug) {
        printMat("out", out, A, C);
    }
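    // To actually compare the two kernels' performance, one could time the launches with CUDA
    // events (sketch only, not in the original gist; shown here for the `matMulBad` launch):
    // cudaEvent_t start, stop;
    // cudaEventCreate(&start); cudaEventCreate(&stop);
    // cudaEventRecord(start);
    // matMulBad<<<blocksPerGridBad, threadsPerBlock>>>(d_left, d_right, d_out, A, B, C);
    // cudaEventRecord(stop);
    // cudaEventSynchronize(stop);
    // float ms = 0.0f;
    // cudaEventElapsedTime(&ms, start, stop);
    // printf("matMulBad took %.3f ms\n", ms);
    // cudaEventDestroy(start); cudaEventDestroy(stop);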
printf("A: %d, B: %d, C: %d", A, B, C);
// Cleanup after kernel execution
cudaFree(d_left); cudaFree(d_right); cudaFree(d_out);
free(left); free(right); free(out);
}