@ekzhang
Created April 30, 2023 20:24
Code from HSRG on matrix multiplication performance optimization; both files are still incomplete.
matmul2.cu

#include <assert.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>

#define MAX_ERR 1e-4
#define CEIL_DIV(x, y) (((x) + (y) - 1) / (y))

#define checkCudaErrors(call)                                   \
  do {                                                          \
    cudaError_t err = call;                                     \
    if (err != cudaSuccess) {                                   \
      printf("CUDA error at %s %d: %s\n", __FILE__, __LINE__,   \
             cudaGetErrorString(err));                          \
      exit(EXIT_FAILURE);                                       \
    }                                                           \
  } while (0)
__global__ void sgemm_naive(int M, int N, int K, float alpha, const float *A,
                            const float *B, float beta, float *C) {
  // compute position in C that this thread is responsible for;
  // consecutive threadIdx.x values map to consecutive columns of C, so the
  // accesses to B and C within a warp are coalesced
  const uint x = blockIdx.x * blockDim.y + threadIdx.y;
  const uint y = blockIdx.y * blockDim.x + threadIdx.x;
  // `if` condition is necessary for when M or N aren't multiples of 32
  if (x < M && y < N) {
    float tmp = 0.0;
    for (int i = 0; i < K; ++i) {
      tmp += A[x * K + i] * B[i * N + y];
    }
    // C = α*(A@B) + β*C
    C[x * N + y] = alpha * tmp + beta * C[x * N + y];
  }
}
int main() {
  const int N = 4096;
  const int M = 4096;
  const int K = 4096;
  const float alpha = 0.5;
  const float beta = 0.7;

  float *a, *b, *c;
  float *d_a, *d_b, *d_c;

  // Allocate host and device memory
  a = (float *)malloc(sizeof(float) * M * K);
  b = (float *)malloc(sizeof(float) * K * N);
  c = (float *)malloc(sizeof(float) * M * N);
  checkCudaErrors(cudaMalloc((void **)&d_a, sizeof(float) * M * K));
  checkCudaErrors(cudaMalloc((void **)&d_b, sizeof(float) * K * N));
  checkCudaErrors(cudaMalloc((void **)&d_c, sizeof(float) * M * N));

  // Fill A, B, and C with ones
  for (int i = 0; i < M; i++) {
    for (int j = 0; j < K; j++) {
      a[i * K + j] = 1.0;
    }
  }
  for (int i = 0; i < K; i++) {
    for (int j = 0; j < N; j++) {
      b[i * N + j] = 1.0;
    }
  }
  for (int i = 0; i < M; i++) {
    for (int j = 0; j < N; j++) {
      c[i * N + j] = 1.0;
    }
  }

  checkCudaErrors(cudaMemcpy(d_a, a, sizeof(float) * M * K, cudaMemcpyHostToDevice));
  checkCudaErrors(cudaMemcpy(d_b, b, sizeof(float) * K * N, cudaMemcpyHostToDevice));
  checkCudaErrors(cudaMemcpy(d_c, c, sizeof(float) * M * N, cudaMemcpyHostToDevice));

  // create as many blocks as necessary to map all of C
  dim3 gridDim(CEIL_DIV(M, 32), CEIL_DIV(N, 32), 1);
  // 32 * 32 = 1024 threads per block
  dim3 blockDim(32, 32, 1);

  // launch the asynchronous execution of the kernel on the device;
  // the call returns immediately on the host
  sgemm_naive<<<gridDim, blockDim>>>(M, N, K, alpha, d_a, d_b, beta, d_c);
  checkCudaErrors(cudaPeekAtLastError());
  checkCudaErrors(cudaDeviceSynchronize());

  checkCudaErrors(cudaMemcpy(c, d_c, sizeof(float) * M * N, cudaMemcpyDeviceToHost));
  checkCudaErrors(cudaFree(d_a));
  checkCudaErrors(cudaFree(d_b));
  checkCudaErrors(cudaFree(d_c));

  // Verification: with all-ones inputs, every entry should be alpha * K + beta
  printf("Done!\n");
  for (int i = 0; i < M * N; i++) {
    assert(fabs(c[i] - alpha * K - beta) < MAX_ERR);
  }
  printf("c[0] = %f\n", c[0]);
  printf("PASSED\n");

  free(a);
  free(b);
  free(c);
}
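As posted, matmul2.cu only checks correctness and never times the kernel, so the ~510 GFLOP/s figure quoted in the comment below was measured outside this listing. A minimal sketch of how the launch above could be timed in place with CUDA events (the variable names are illustrative; the 2·M·N·K operation count mirrors what the Rust benchmark uses):

// Hypothetical timing wrapper around the existing launch, using CUDA events.
cudaEvent_t start, stop;
checkCudaErrors(cudaEventCreate(&start));
checkCudaErrors(cudaEventCreate(&stop));

checkCudaErrors(cudaEventRecord(start));
sgemm_naive<<<gridDim, blockDim>>>(M, N, K, alpha, d_a, d_b, beta, d_c);
checkCudaErrors(cudaEventRecord(stop));
checkCudaErrors(cudaEventSynchronize(stop));

float ms = 0.0f;
checkCudaErrors(cudaEventElapsedTime(&ms, start, stop));
// 2*M*N*K floating-point operations per GEMM
double gflops = (2.0 * M * N * K) / (ms * 1e-3) * 1e-9;
printf("kernel time: %.3f ms, %.1f GFLOP/s\n", ms, gflops);
checkCudaErrors(cudaEventDestroy(start));
checkCudaErrors(cudaEventDestroy(stop));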
simd_matmul.rs

use std::time::{Duration, Instant};

use rayon::prelude::*;
use wide::f32x8;

const N: usize = 1024;
const M: usize = 1024;
const K: usize = 1024;
fn matmul_naive(a: &[f32], b: &[f32], c: &mut [f32]) {
    for i in 0..M {
        for j in 0..N {
            let mut temp = 0.0;
            for k in 0..K {
                temp += a[i * K + k] * b[k * N + j];
            }
            c[i * N + j] = temp;
        }
    }
}
fn matmul_threads(a: &[f32], b: &[f32], c: &mut [f32]) {
    // each rayon task computes one row of C
    c.par_chunks_exact_mut(N)
        .enumerate()
        .for_each(|(i, c_row)| {
            for j in 0..N {
                let mut temp: f32 = 0.0;
                for k in 0..K {
                    temp += a[i * K + k] * b[k * N + j];
                }
                c_row[j] = temp;
            }
        });
}
fn matmul_threads_transposed(a: &[f32], b: &[f32], c: &mut [f32]) {
    // transpose B so the inner loop reads both operands contiguously
    let mut b_transposed = vec![0.0; b.len()];
    for i in 0..K {
        for j in 0..N {
            b_transposed[j * K + i] = b[i * N + j];
        }
    }
    c.par_chunks_exact_mut(N)
        .enumerate()
        .for_each(|(i, c_row)| {
            for j in 0..N {
                let mut temp: f32 = 0.0;
                for k in 0..K {
                    temp += a[i * K + k] * b_transposed[j * K + k];
                }
                c_row[j] = temp;
            }
        });
}
fn matmul_threads_transposed_simd(a: &[f32], b: &[f32], c: &mut [f32]) {
    let mut b_transposed = vec![0.0; b.len()];
    for i in 0..K {
        for j in 0..N {
            b_transposed[j * K + i] = b[i * N + j];
        }
    }
    c.par_chunks_exact_mut(N)
        .enumerate()
        .for_each(|(i, c_row)| {
            for j in 0..N {
                // a: [ ... < i*K .. i*K + K > ... ]
                // b: [ ... < j*K .. j*K + K > ... ]
                let chunks = K / 8;
                let mut summand = f32x8::ZERO;
                unsafe {
                    // Vec<f32> data is only guaranteed 4-byte aligned, so read
                    // the 8-lane vectors unaligned instead of dereferencing.
                    let a_ptr = a.as_ptr().add(i * K) as *const f32x8;
                    let b_ptr = b_transposed.as_ptr().add(j * K) as *const f32x8;
                    for chunk in 0..chunks {
                        summand += a_ptr.add(chunk).read_unaligned()
                            * b_ptr.add(chunk).read_unaligned();
                    }
                }
                // horizontal sum of the 8 lanes, plus any scalar remainder
                let mut total_sum: f32 = summand.as_array_ref().iter().sum();
                for k in (chunks * 8)..K {
                    total_sum += a[i * K + k] * b_transposed[j * K + k];
                }
                c_row[j] = total_sum;
            }
        });
}
fn run_benchmark(iters: u32, f: impl Fn(&[f32], &[f32], &mut [f32])) {
    let mut total_time = Duration::ZERO;
    for _ in 0..iters {
        let a = vec![1.0_f32; M * K];
        let b = vec![1.0_f32; K * N];
        let mut c = vec![1.0_f32; M * N];
        let start = Instant::now();
        f(&a, &b, &mut c);
        total_time += start.elapsed();
        assert!(c.iter().all(|&x| (x - K as f32).abs() < 1e-6));
    }
    let t = total_time.div_f64(iters as f64);
    let ops = 2 * M * N * K;
    let gflops = 1e-9 * ops as f64 / t.as_secs_f64();
    println!(" .. took {t:?}");
    println!(" .. gflops {gflops:.2}");
}
fn main() {
    println!("Vanilla implementation");
    run_benchmark(1, matmul_naive);

    println!("Threaded implementation");
    run_benchmark(8, matmul_threads);

    println!("Threaded transposed implementation");
    run_benchmark(20, matmul_threads_transposed);

    println!("Threaded transposed simd implementation");
    run_benchmark(50, matmul_threads_transposed_simd);
}
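A note on the Rust progression: transposing B is what turns the inner loop into two unit-stride scans, which is both the cache-friendliness win and what lets the SIMD variant consume each dot product as K / 8 = 128 contiguous f32x8 chunks. Since K = 1024 is a multiple of 8, chunks * 8 == K and the scalar remainder loop never executes; it only exists for sizes that aren't multiples of 8. Also, run_benchmark times only the call to f, so the cost of building b_transposed inside the transposed variants is included in their measured GFLOP/s.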
ekzhang commented Apr 30, 2023

This was done on a shared g5g.4xlarge instance (16 Graviton2 ARM cores, 1x Nvidia T4g GPU). matmul2.cu is currently at Step 2 of How to Optimize a CUDA Matmul Kernel for cuBLAS-like Performance: a Worklog, and it achieves around 510 GFLOP/s. simd_matmul.rs gradually builds up a cache-aware, non-tiled CPU algorithm using multiple threads (rayon) and explicit portable SIMD operations (wide); it gets around 120 GFLOP/s.
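For scale, using the same 2·M·N·K operation count that run_benchmark uses: the 4096³ GPU GEMM is 2 · 4096³ ≈ 137 GFLOP, so ~510 GFLOP/s works out to roughly 0.27 s per multiply, while the 1024³ CPU GEMM is 2 · 1024³ ≈ 2.1 GFLOP, so ~120 GFLOP/s works out to roughly 18 ms per multiply.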
