### Gist by @aleozlx, created August 16, 2021
import numpy as np

### A (3 x K) and B (K x 4) are assumed to be defined above;
### this snippet only shows the tiled multiply that accumulates C = A @ B.
C = np.zeros((3, 4), dtype=int)
block_size = (3, 2)
div_up = lambda a, b: (a + b - 1) // b
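
### Added illustration (not in the original gist): div_up gives the CUDA
### grid dimensions implied by this tiling.
grid_dim = (div_up(C.shape[0], block_size[0]),
            div_up(C.shape[1], block_size[1]))  # -> (1, 2) blocks for a 3x4 C
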
### CUDA Grid
for m in range(0, C.shape[0], block_size[0]):
    for n in range(0, C.shape[1], block_size[1]):
        ### Main loop in the CUDA kernel.
        ### A smaller K is favorable for satisfying the shared memory bandwidth.
        for k in range(A.shape[1]):
            ### The 3x2 block is what hypothetically fits in the shared memory.
            ### The thread block will contain multiple warps.
            row_range = slice(m, m + block_size[0])
            col_range = slice(n, n + block_size[1])
            ### Each warp loads a "fragment" into the register file
            ### from the shared memory. The warp-level decomposition is not
            ### shown here, but it is essentially an unrolled loop that
            ### further divides {row,col}_range (see the sketch after the
            ### OUTPUT block below).
            frag_A = A[row_range, k]
            frag_B = B[k, col_range]
            ### Warp threads cooperatively compute the outer product.
            ### This can be done with the warp-level "WMMA" primitives (CUDA 9+);
            ### nvcuda::wmma can be used to target the Tensor Cores.
            C[row_range, col_range] += np.outer(frag_A, frag_B)
        print(f"C(t={n//block_size[1]}) =\n", C)
"""
OUTPUT
=============
C(t=0) =
 [[42 29  0  0]
 [22 14  0  0]
 [11  9  0  0]]
C(t=1) =
 [[42 29 26 21]
 [22 14 16 10]
 [11  9  4  5]]
"""