@ekzhang
Created April 30, 2023 20:24
Code from HSRG on matrix multiplication performance optimization; both files are still incomplete.
matmul2.cu

#include <assert.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>

#define MAX_ERR 1e-4
#define CEIL_DIV(x, y) (((x) + (y) - 1) / (y))

#define checkCudaErrors(call)                                   \
  do {                                                          \
    cudaError_t err = call;                                     \
    if (err != cudaSuccess) {                                   \
      printf("CUDA error at %s %d: %s\n", __FILE__, __LINE__,   \
             cudaGetErrorString(err));                          \
      exit(EXIT_FAILURE);                                       \
    }                                                           \
  } while (0)
__global__ void sgemm_naive(int M, int N, int K, float alpha, const float *A,
                            const float *B, float beta, float *C) {
  // compute position in C that this thread is responsible for;
  // consecutive threadIdx.x values map to consecutive columns of C, so the
  // accesses to B and C within a warp are coalesced
  const uint x = blockIdx.x * blockDim.y + threadIdx.y;
  const uint y = blockIdx.y * blockDim.x + threadIdx.x;
  // `if` condition is necessary for when M or N aren't multiples of 32
  if (x < M && y < N) {
    float tmp = 0.0;
    for (int i = 0; i < K; ++i) {
      tmp += A[x * K + i] * B[i * N + y];
    }
    // C = α*(A@B) + β*C
    C[x * N + y] = alpha * tmp + beta * C[x * N + y];
  }
}
int main() {
  const int N = 4096;
  const int M = 4096;
  const int K = 4096;
  const float alpha = 0.5;
  const float beta = 0.7;

  float *a, *b, *c;
  float *d_a, *d_b, *d_c;

  // Allocate host and device memory
  a = (float *)malloc(sizeof(float) * M * K);
  b = (float *)malloc(sizeof(float) * K * N);
  c = (float *)malloc(sizeof(float) * M * N);
  checkCudaErrors(cudaMalloc((void **)&d_a, sizeof(float) * M * K));
  checkCudaErrors(cudaMalloc((void **)&d_b, sizeof(float) * K * N));
  checkCudaErrors(cudaMalloc((void **)&d_c, sizeof(float) * M * N));

  // Fill A, B, and C with ones
  for (int i = 0; i < M; i++) {
    for (int j = 0; j < K; j++) {
      a[i * K + j] = 1.0;
    }
  }
  for (int i = 0; i < K; i++) {
    for (int j = 0; j < N; j++) {
      b[i * N + j] = 1.0;
    }
  }
  for (int i = 0; i < M; i++) {
    for (int j = 0; j < N; j++) {
      c[i * N + j] = 1.0;
    }
  }

  checkCudaErrors(cudaMemcpy(d_a, a, sizeof(float) * M * K, cudaMemcpyHostToDevice));
  checkCudaErrors(cudaMemcpy(d_b, b, sizeof(float) * K * N, cudaMemcpyHostToDevice));
  checkCudaErrors(cudaMemcpy(d_c, c, sizeof(float) * M * N, cudaMemcpyHostToDevice));

  // create as many blocks as necessary to map all of C
  dim3 gridDim(CEIL_DIV(M, 32), CEIL_DIV(N, 32), 1);
  // 32 * 32 = 1024 threads per block
  dim3 blockDim(32, 32, 1);

  // launch the asynchronous execution of the kernel on the device;
  // the call returns immediately on the host
  sgemm_naive<<<gridDim, blockDim>>>(M, N, K, alpha, d_a, d_b, beta, d_c);
  checkCudaErrors(cudaPeekAtLastError());
  checkCudaErrors(cudaDeviceSynchronize());

  checkCudaErrors(cudaMemcpy(c, d_c, sizeof(float) * M * N, cudaMemcpyDeviceToHost));
  checkCudaErrors(cudaFree(d_a));
  checkCudaErrors(cudaFree(d_b));
  checkCudaErrors(cudaFree(d_c));

  // Verification: with all-ones inputs, every entry should be alpha * K + beta
  printf("Done!\n");
  for (int i = 0; i < M * N; i++) {
    assert(fabs(c[i] - alpha * K - beta) < MAX_ERR);
  }
  printf("c[0] = %f\n", c[0]);
  printf("PASSED\n");

  free(a);
  free(b);
  free(c);
}
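As posted, matmul2.cu only checks correctness and never times the kernel, so the ~510 GFLOP/s figure quoted in the comment below was measured outside this listing. A minimal sketch of how the launch above could be timed in place with CUDA events (the variable names are illustrative; the 2·M·N·K operation count mirrors what the Rust benchmark uses):

// Hypothetical timing wrapper around the existing launch, using CUDA events.
cudaEvent_t start, stop;
checkCudaErrors(cudaEventCreate(&start));
checkCudaErrors(cudaEventCreate(&stop));

checkCudaErrors(cudaEventRecord(start));
sgemm_naive<<<gridDim, blockDim>>>(M, N, K, alpha, d_a, d_b, beta, d_c);
checkCudaErrors(cudaEventRecord(stop));
checkCudaErrors(cudaEventSynchronize(stop));

float ms = 0.0f;
checkCudaErrors(cudaEventElapsedTime(&ms, start, stop));
// 2*M*N*K floating-point operations per GEMM
double gflops = (2.0 * M * N * K) / (ms * 1e-3) * 1e-9;
printf("kernel time: %.3f ms, %.1f GFLOP/s\n", ms, gflops);
checkCudaErrors(cudaEventDestroy(start));
checkCudaErrors(cudaEventDestroy(stop));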
simd_matmul.rs

use std::time::{Duration, Instant};

use rayon::prelude::*;
use wide::f32x8;

const N: usize = 1024;
const M: usize = 1024;
const K: usize = 1024;
fn matmul_naive(a: &[f32], b: &[f32], c: &mut [f32]) {
    for i in 0..M {
        for j in 0..N {
            let mut temp = 0.0;
            for k in 0..K {
                temp += a[i * K + k] * b[k * N + j];
            }
            c[i * N + j] = temp;
        }
    }
}
fn matmul_threads(a: &[f32], b: &[f32], c: &mut [f32]) {
    // each rayon task computes one row of C
    c.par_chunks_exact_mut(N)
        .enumerate()
        .for_each(|(i, c_row)| {
            for j in 0..N {
                let mut temp: f32 = 0.0;
                for k in 0..K {
                    temp += a[i * K + k] * b[k * N + j];
                }
                c_row[j] = temp;
            }
        });
}
fn matmul_threads_transposed(a: &[f32], b: &[f32], c: &mut [f32]) {
    // transpose B so the inner loop reads both operands contiguously
    let mut b_transposed = vec![0.0; b.len()];
    for i in 0..K {
        for j in 0..N {
            b_transposed[j * K + i] = b[i * N + j];
        }
    }
    c.par_chunks_exact_mut(N)
        .enumerate()
        .for_each(|(i, c_row)| {
            for j in 0..N {
                let mut temp: f32 = 0.0;
                for k in 0..K {
                    temp += a[i * K + k] * b_transposed[j * K + k];
                }
                c_row[j] = temp;
            }
        });
}
fn matmul_threads_transposed_simd(a: &[f32], b: &[f32], c: &mut [f32]) {
    let mut b_transposed = vec![0.0; b.len()];
    for i in 0..K {
        for j in 0..N {
            b_transposed[j * K + i] = b[i * N + j];
        }
    }
    c.par_chunks_exact_mut(N)
        .enumerate()
        .for_each(|(i, c_row)| {
            for j in 0..N {
                // a: [ ... < i*K .. i*K + K > ... ]
                // b: [ ... < j*K .. j*K + K > ... ]
                let chunks = K / 8;
                let mut summand = f32x8::ZERO;
                unsafe {
                    // Vec<f32> data is only guaranteed 4-byte aligned, so read
                    // the 8-lane vectors unaligned instead of dereferencing.
                    let a_ptr = a.as_ptr().add(i * K) as *const f32x8;
                    let b_ptr = b_transposed.as_ptr().add(j * K) as *const f32x8;
                    for chunk in 0..chunks {
                        summand += a_ptr.add(chunk).read_unaligned()
                            * b_ptr.add(chunk).read_unaligned();
                    }
                }
                // horizontal sum of the 8 lanes, plus any scalar remainder
                let mut total_sum: f32 = summand.as_array_ref().iter().sum();
                for k in (chunks * 8)..K {
                    total_sum += a[i * K + k] * b_transposed[j * K + k];
                }
                c_row[j] = total_sum;
            }
        });
}
fn run_benchmark(iters: u32, f: impl Fn(&[f32], &[f32], &mut [f32])) {
    let mut total_time = Duration::ZERO;
    for _ in 0..iters {
        let a = vec![1.0_f32; M * K];
        let b = vec![1.0_f32; K * N];
        let mut c = vec![1.0_f32; M * N];
        let start = Instant::now();
        f(&a, &b, &mut c);
        total_time += start.elapsed();
        assert!(c.iter().all(|&x| (x - K as f32).abs() < 1e-6));
    }
    let t = total_time.div_f64(iters as f64);
    let ops = 2 * M * N * K;
    let gflops = 1e-9 * ops as f64 / t.as_secs_f64();
    println!(" .. took {t:?}");
    println!(" .. gflops {gflops:.2}");
}
fn main() {
    println!("Vanilla implementation");
    run_benchmark(1, matmul_naive);

    println!("Threaded implementation");
    run_benchmark(8, matmul_threads);

    println!("Threaded transposed implementation");
    run_benchmark(20, matmul_threads_transposed);

    println!("Threaded transposed simd implementation");
    run_benchmark(50, matmul_threads_transposed_simd);
}
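A note on the Rust progression: transposing B is what turns the inner loop into two unit-stride scans, which is both the cache-friendliness win and what lets the SIMD variant consume each dot product as K / 8 = 128 contiguous f32x8 chunks. Since K = 1024 is a multiple of 8, chunks * 8 == K and the scalar remainder loop never executes; it only exists for sizes that aren't multiples of 8. Also, run_benchmark times only the call to f, so the cost of building b_transposed inside the transposed variants is included in their measured GFLOP/s.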
ekzhang commented Apr 30, 2023

This was done on a shared g5g.4xlarge instance (16 Graviton2 ARM cores, 1x Nvidia T4g GPU). matmul2.cu is currently at Step 2 of How to Optimize a CUDA Matmul Kernel for cuBLAS-like Performance: a Worklog, and it achieves around 510 GFLOP/s. simd_matmul.rs gradually builds up a cache-aware, non-tiled CPU algorithm using multiple threads (rayon) and explicit portable SIMD operations (wide); it gets around 120 GFLOP/s.
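For scale, using the same 2·M·N·K operation count that run_benchmark uses: the 4096³ GPU GEMM is 2 · 4096³ ≈ 137 GFLOP, so ~510 GFLOP/s works out to roughly 0.27 s per multiply, while the 1024³ CPU GEMM is 2 · 1024³ ≈ 2.1 GFLOP, so ~120 GFLOP/s works out to roughly 18 ms per multiply.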
