Skip to content

Instantly share code, notes, and snippets.

@Peter0x44
Created November 13, 2025 05:59
Show Gist options
  • Select an option

  • Save Peter0x44/ad462d28c05641230ec2727972687020 to your computer and use it in GitHub Desktop.

Select an option

Save Peter0x44/ad462d28c05641230ec2727972687020 to your computer and use it in GitHub Desktop.
#if 0
# Self-building Matrix Multiplication Benchmark
# Execute this file with: sh benchmark_4x4_matmul.c
#
# How the polyglot works: sh sees every line that starts with a hash as a
# comment and runs the commands below, while the C preprocessor discards
# everything between the opening conditional and its closing directive.
# The script has to end with an explicit exit so that sh never reaches the
# C source further down.
#
# CC_GCC and CC_CLANG can be set in the environment to choose the compilers.
CC_GCC=${CC_GCC:-gcc}
CC_CLANG=${CC_CLANG:-clang}
echo "=== Building Matrix Multiplication Benchmark ==="
echo "GCC version: $($CC_GCC --version | head -n 1)"
echo "Clang version: $($CC_CLANG --version | head -n 1)"
echo ""
# Compile with GCC - will create mat4x4_mul_gcc function
# The script path is quoted so that a path containing spaces still works.
echo "Compiling mat4x4_mul_gcc.o with GCC..."
$CC_GCC -O3 -march=native -c -o mat4x4_mul_gcc.o "$0" || { echo "GCC compilation failed"; exit 1; }
# Compile with Clang - will create mat4x4_mul_clang function
echo "Compiling mat4x4_mul_clang.o with Clang..."
$CC_CLANG -O3 -march=native -c -o mat4x4_mul_clang.o "$0" || { echo "Clang compilation failed"; exit 1; }
# Compile test harness and link both object files
echo "Compiling benchmark test harness..."
$CC_GCC -DTEST_HARNESS -O3 -march=native -Wall -o benchmark "$0" mat4x4_mul_gcc.o mat4x4_mul_clang.o -lm || { echo "Link failed"; exit 1; }
echo ""
echo "=== Running Benchmark ==="
# Run the benchmark and hand its exit status back to the caller.
./benchmark
exit $?
#endif
#ifndef TEST_HARNESS
// Matrix multiplication implementations - only compile when NOT building test harness
#ifdef __clang__
// Multiplies two 4x4 float matrices: C = A * B.
// Each matrix is a flat array of 16 floats addressed as [row * 4 + column].
// A, B: input matrices (only read here). C: output, every element is overwritten.
// All three pointers are restrict-qualified, so the arrays passed in must not overlap.
// This copy sits in the __clang__ branch of the surrounding #ifdef, so it is
// only compiled when that macro is defined.
void mat4x4_mul_clang(float * restrict A, float * restrict B, float * restrict C) {
for (int i = 0; i < 4; i++) {
for (int j = 0; j < 4; j++) {
// Dot product of row i of A with column j of B, accumulated in k order.
float sum = 0.0f;
for (int k = 0; k < 4; k++) {
sum += A[i * 4 + k] * B[k * 4 + j];
}
C[i * 4 + j] = sum;
}
}
}
#elif defined(__GNUC__)
// Multiplies two 4x4 float matrices: C = A * B.
// Each matrix is a flat array of 16 floats addressed as [row * 4 + column].
// A, B: input matrices (only read here). C: output, every element is overwritten.
// All three pointers are restrict-qualified, so the arrays passed in must not overlap.
// This copy sits in the __GNUC__ branch of the surrounding #ifdef, which is
// only reached when __clang__ is not defined. The loop body is the same as in
// mat4x4_mul_clang.
void mat4x4_mul_gcc(float * restrict A, float * restrict B, float * restrict C) {
for (int i = 0; i < 4; i++) {
for (int j = 0; j < 4; j++) {
// Dot product of row i of A with column j of B, accumulated in k order.
float sum = 0.0f;
for (int k = 0; k < 4; k++) {
sum += A[i * 4 + k] * B[k * 4 + j];
}
C[i * 4 + j] = sum;
}
}
}
#endif
#else // TEST_HARNESS
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <time.h>
#include <math.h>
#define ITERATIONS 10000000000LL
#define WARMUP_ITERATIONS 10000000LL
void mat4x4_mul_gcc(float *A, float *B, float *C);
void mat4x4_mul_clang(float *A, float *B, float *C);
// Fills the 16 elements of mat with 1.0f, 2.0f, ..., 16.0f in index order.
void init_matrix(float *mat) {
    int next_value = 1;
    float *const stop = mat + 16;
    for (float *cell = mat; cell != stop; ++cell) {
        *cell = (float)next_value;
        ++next_value;
    }
}
// Prints a 4x4 matrix to stdout.
// name: label printed on its own line, followed by a colon.
// mat:  16 floats addressed as [row * 4 + column]; only read, hence const.
// Each row is printed as " [a, b, c, d]" with every value formatted as %8.3f.
void print_matrix(const char *name, const float *mat) {
    printf("%s:\n", name);
    for (int row = 0; row < 4; row++) {
        printf(" [");
        for (int col = 0; col < 4; col++) {
            printf("%8.3f", mat[row * 4 + col]);
            // Separator goes between values only, not after the last one.
            if (col < 3) {
                printf(", ");
            }
        }
        printf("]\n");
    }
}
// Compares two 16-element float arrays element by element.
// Returns 1 when every pair compares equal with ==, and 0 as soon as one pair
// differs. The comparison is exact (no tolerance), so a NaN in either array
// makes the result 0. Neither array is modified, hence the const qualifiers.
int matrices_equal(const float *a, const float *b) {
    for (int i = 0; i < 16; i++) {
        if (a[i] != b[i]) {
            return 0;
        }
    }
    return 1;
}
// Returns the current CLOCK_MONOTONIC reading as one nanosecond count
// (tv_sec * 1000000000 + tv_nsec).
// If the clock cannot be read, the error is reported with perror and the
// program exits with EXIT_FAILURE: the timespec would be left unset and any
// timing derived from it would be meaningless.
int64_t get_nanos(void) {
    struct timespec ts;
    if (clock_gettime(CLOCK_MONOTONIC, &ts) != 0) {
        perror("clock_gettime");
        exit(EXIT_FAILURE);
    }
    // Widen before multiplying so the seconds-to-nanoseconds product is 64-bit.
    return (int64_t)ts.tv_sec * 1000000000LL + ts.tv_nsec;
}
// Benchmark driver.
// Builds two input matrices, checks once that mat4x4_mul_gcc and
// mat4x4_mul_clang produce the same product, then for each implementation runs
// WARMUP_ITERATIONS untimed calls followed by ITERATIONS timed calls and prints
// total time, time per call and the relative speed.
// Returns 0 on success, or EXIT_FAILURE when the two results differed (the
// timings are still printed in that case).
int main(void) {
    __attribute__((aligned(32))) float A[16];
    __attribute__((aligned(32))) float B[16];
    __attribute__((aligned(32))) float C_gcc[16];
    __attribute__((aligned(32))) float C_clang[16];
    init_matrix(A);
    init_matrix(B);

    printf("=== Matrix Multiplication Benchmark ===\n");
    printf("Comparing GCC vs Clang implementations\n");
    printf("Matrix size: 4x4\n");
    printf("Warmup iterations: %lld\n", WARMUP_ITERATIONS);
    printf("Benchmark iterations: %lld\n\n", ITERATIONS);

    printf("Verifying correctness...\n");
    mat4x4_mul_gcc(A, B, C_gcc);
    mat4x4_mul_clang(A, B, C_clang);
    // Remember the outcome so it can be reflected in the exit status.
    int results_match = matrices_equal(C_gcc, C_clang);
    if (results_match) {
        printf("✓ Both implementations produce identical results\n\n");
    } else {
        printf("✗ WARNING: Results differ!\n");
        print_matrix("GCC result", C_gcc);
        print_matrix("Clang result", C_clang);
        printf("\n");
    }

    // Warmup and benchmark GCC version
    printf("Warming up GCC implementation...\n");
    // stdout is not line-buffered when redirected; flush so the progress line
    // is visible before the long loop starts.
    fflush(stdout);
    for (int64_t i = 0; i < WARMUP_ITERATIONS; i++) {
        mat4x4_mul_gcc(A, B, C_gcc);
    }
    printf("Benchmarking GCC implementation...\n");
    fflush(stdout);
    int64_t start = get_nanos();
    for (int64_t i = 0; i < ITERATIONS; i++) {
        mat4x4_mul_gcc(A, B, C_gcc);
    }
    int64_t end = get_nanos();
    int64_t gcc_time = end - start;
    double gcc_ns_per_op = (double)gcc_time / ITERATIONS;

    // Warmup and benchmark Clang version
    printf("\nWarming up Clang implementation...\n");
    fflush(stdout);
    for (int64_t i = 0; i < WARMUP_ITERATIONS; i++) {
        mat4x4_mul_clang(A, B, C_clang);
    }
    printf("Benchmarking Clang implementation...\n");
    fflush(stdout);
    start = get_nanos();
    for (int64_t i = 0; i < ITERATIONS; i++) {
        mat4x4_mul_clang(A, B, C_clang);
    }
    end = get_nanos();
    int64_t clang_time = end - start;
    double clang_ns_per_op = (double)clang_time / ITERATIONS;

    printf("\n=== Results ===\n");
    printf("GCC version:\n");
    printf(" Total time: %.3f seconds\n", gcc_time / 1e9);
    printf(" Time per operation: %.2f ns\n", gcc_ns_per_op);
    printf("\n");
    printf("Clang version:\n");
    printf(" Total time: %.3f seconds\n", clang_time / 1e9);
    printf(" Time per operation: %.2f ns\n", clang_ns_per_op);
    printf("\n");

    if (gcc_ns_per_op < clang_ns_per_op) {
        double speedup = clang_ns_per_op / gcc_ns_per_op;
        printf("GCC is %.2fx faster (%.1f%% faster)\n",
               speedup, (speedup - 1.0) * 100.0);
    } else if (clang_ns_per_op < gcc_ns_per_op) {
        double speedup = gcc_ns_per_op / clang_ns_per_op;
        printf("Clang is %.2fx faster (%.1f%% faster)\n",
               speedup, (speedup - 1.0) * 100.0);
    } else {
        // Equal timings: do not declare either compiler the winner.
        printf("Both implementations took the same time per operation\n");
    }

    // A mismatch between the two results invalidates the comparison.
    return results_match ? 0 : EXIT_FAILURE;
}
#endif
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment