Created
November 13, 2025 05:59
-
-
Save Peter0x44/ad462d28c05641230ec2727972687020 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#if 0
# Self-building matrix multiplication benchmark.
# Execute this file with: sh benchmark_4x4_matmul.c
#
# Polyglot layout: the shell treats the preprocessor lines as comments and
# leaves through the exit below, while the C compiler skips this whole
# section because of the surrounding conditional.
#
# NOTE: the preprocessor still scans skipped lines. Keep comments here free
# of unmatched quote characters, and never start one with a conditional
# directive keyword directly after the hash.
#
# CC_GCC must be a compiler that does not define __clang__, and CC_CLANG one
# that does: the kernel section picks the function name from that macro, and
# the final link needs both mat4x4_mul_gcc and mat4x4_mul_clang.
CC_GCC=${CC_GCC:-gcc}
CC_CLANG=${CC_CLANG:-clang}

echo "=== Building Matrix Multiplication Benchmark ==="
echo "GCC version: $($CC_GCC --version | head -1)"
echo "Clang version: $($CC_CLANG --version | head -1)"
echo ""

# The compiler variables are expanded unquoted so they may carry extra words
# (for example a wrapper followed by the compiler name). The script path is
# quoted so that a path containing spaces still reaches the compiler as one
# argument.

# Compile with GCC - will create mat4x4_mul_gcc function
echo "Compiling mat4x4_mul_gcc.o with GCC..."
$CC_GCC -O3 -march=native -c -o mat4x4_mul_gcc.o "$0" || { echo "GCC compilation failed"; exit 1; }

# Compile with Clang - will create mat4x4_mul_clang function
echo "Compiling mat4x4_mul_clang.o with Clang..."
$CC_CLANG -O3 -march=native -c -o mat4x4_mul_clang.o "$0" || { echo "Clang compilation failed"; exit 1; }

# Compile test harness and link both object files
echo "Compiling benchmark test harness..."
$CC_GCC -DTEST_HARNESS -O3 -march=native -Wall -o benchmark "$0" mat4x4_mul_gcc.o mat4x4_mul_clang.o -lm || { echo "Link failed"; exit 1; }

echo ""
echo "=== Running Benchmark ==="
./benchmark
# Hand the benchmark status back to the caller so a crash is not reported as
# success.
exit $?
#endif
| #ifndef TEST_HARNESS | |
| // Matrix multiplication implementations - only compile when NOT building test harness | |
| #ifdef __clang__ | |
// Computes C = A * B for 4x4 float matrices stored row-major in 16-element
// arrays. Each output element is a left-to-right sum of four products,
// starting from 0.0f.
//
// All three pointers are restrict-qualified, so the arrays must not overlap.
// This definition sits under the __clang__ branch of the surrounding
// conditional; the loop nest is kept in its plain triple-loop form on purpose,
// since the benchmark measures what the compiler makes of exactly this code.
void mat4x4_mul_clang(float * restrict A, float * restrict B, float * restrict C) {
    for (int row = 0; row < 4; row++) {
        for (int col = 0; col < 4; col++) {
            float acc = 0.0f;
            for (int inner = 0; inner < 4; inner++) {
                acc += A[row * 4 + inner] * B[inner * 4 + col];
            }
            C[row * 4 + col] = acc;
        }
    }
}
| #elif defined(__GNUC__) | |
// Computes C = A * B for 4x4 float matrices stored row-major in 16-element
// arrays. Each output element is a left-to-right sum of four products,
// starting from 0.0f.
//
// All three pointers are restrict-qualified, so the arrays must not overlap.
// This definition sits under the __GNUC__ (non-Clang) branch of the
// surrounding conditional; the loop nest is kept in its plain triple-loop
// form on purpose, since the benchmark measures what the compiler makes of
// exactly this code.
void mat4x4_mul_gcc(float * restrict A, float * restrict B, float * restrict C) {
    for (int row = 0; row < 4; row++) {
        for (int col = 0; col < 4; col++) {
            float acc = 0.0f;
            for (int inner = 0; inner < 4; inner++) {
                acc += A[row * 4 + inner] * B[inner * 4 + col];
            }
            C[row * 4 + col] = acc;
        }
    }
}
| #endif | |
| #else // TEST_HARNESS | |
| #include <stdio.h> | |
| #include <stdlib.h> | |
| #include <stdint.h> | |
| #include <time.h> | |
| #include <math.h> | |
// Number of timed and untimed (warm-up) calls made per kernel.
//
// The defaults can be replaced at build time, e.g.
//   -DBENCH_ITERATIONS=1000000 -DBENCH_WARMUP_ITERATIONS=1000
// for a quick run. The public macros cast to long long so that they keep
// matching the %lld conversions used when they are printed, whatever integer
// type the override has.
#ifndef BENCH_ITERATIONS
#define BENCH_ITERATIONS 10000000000LL
#endif
#ifndef BENCH_WARMUP_ITERATIONS
#define BENCH_WARMUP_ITERATIONS 10000000LL
#endif
#define ITERATIONS ((long long)(BENCH_ITERATIONS))
#define WARMUP_ITERATIONS ((long long)(BENCH_WARMUP_ITERATIONS))

// Kernels under test. They are defined in the non-harness half of this file
// and reach the harness through separately compiled object files.
void mat4x4_mul_gcc(float *A, float *B, float *C);
void mat4x4_mul_clang(float *A, float *B, float *C);
// Fills the 16-element array `mat` with the values 1.0f through 16.0f in
// index order.
void init_matrix(float *mat) {
    for (int value = 1; value <= 16; value++) {
        mat[value - 1] = (float)value;
    }
}
// Prints `name` followed by the 16-element array `mat` as four bracketed rows
// of four values each, using %8.3f per value, to stdout.
//
// `mat` is only read, so it is const-qualified; callers passing a plain
// float pointer are unaffected.
void print_matrix(const char *name, const float *mat) {
    printf("%s:\n", name);
    for (int row = 0; row < 4; row++) {
        printf(" [");
        for (int col = 0; col < 4; col++) {
            printf("%8.3f", mat[row * 4 + col]);
            if (col < 3) {
                printf(", ");  // separator between values, not after the last
            }
        }
        printf("]\n");
    }
}
// Compares two 16-element float arrays element by element with `!=`.
// Returns 1 when every pair compares equal and 0 at the first pair that does
// not. The comparison is exact (no tolerance); as with any float comparison,
// 0.0f equals -0.0f and a NaN never equals anything.
//
// Both arrays are only read, so they are const-qualified; callers passing
// plain float pointers are unaffected.
int matrices_equal(const float *a, const float *b) {
    for (int i = 0; i < 16; i++) {
        if (a[i] != b[i]) {
            return 0;
        }
    }
    return 1;
}
// Returns the current CLOCK_MONOTONIC reading in nanoseconds.
//
// If clock_gettime fails, the timespec would be left unset and any timing
// derived from it would be meaningless, so the error is reported with perror
// and the process exits with EXIT_FAILURE.
int64_t get_nanos(void) {
    struct timespec ts;
    if (clock_gettime(CLOCK_MONOTONIC, &ts) != 0) {
        perror("clock_gettime");
        exit(EXIT_FAILURE);
    }
    // Widen the seconds before scaling so the multiply is done in 64 bits.
    return (int64_t)ts.tv_sec * 1000000000LL + ts.tv_nsec;
}
| int main(void) { | |
| __attribute__((aligned(32))) float A[16]; | |
| __attribute__((aligned(32))) float B[16]; | |
| __attribute__((aligned(32))) float C_gcc[16]; | |
| __attribute__((aligned(32))) float C_clang[16]; | |
| init_matrix(A); | |
| init_matrix(B); | |
| printf("=== Matrix Multiplication Benchmark ===\n"); | |
| printf("Comparing GCC vs Clang implementations\n"); | |
| printf("Matrix size: 4x4\n"); | |
| printf("Warmup iterations: %lld\n", WARMUP_ITERATIONS); | |
| printf("Benchmark iterations: %lld\n\n", ITERATIONS); | |
| printf("Verifying correctness...\n"); | |
| mat4x4_mul_gcc(A, B, C_gcc); | |
| mat4x4_mul_clang(A, B, C_clang); | |
| if (matrices_equal(C_gcc, C_clang)) { | |
| printf("✓ Both implementations produce identical results\n\n"); | |
| } else { | |
| printf("✗ WARNING: Results differ!\n"); | |
| print_matrix("GCC result", C_gcc); | |
| print_matrix("Clang result", C_clang); | |
| printf("\n"); | |
| } | |
| printf("Warming up GCC implementation...\n"); | |
| for (int64_t i = 0; i < WARMUP_ITERATIONS; i++) { | |
| mat4x4_mul_gcc(A, B, C_gcc); | |
| } | |
| printf("Benchmarking GCC implementation...\n"); | |
| int64_t start = get_nanos(); | |
| for (int64_t i = 0; i < ITERATIONS; i++) { | |
| mat4x4_mul_gcc(A, B, C_gcc); | |
| } | |
| int64_t end = get_nanos(); | |
| int64_t gcc_time = end - start; | |
| double gcc_ns_per_op = (double)gcc_time / ITERATIONS; | |
| // Warmup and benchmark Clang version | |
| printf("\nWarming up Clang implementation...\n"); | |
| for (int64_t i = 0; i < WARMUP_ITERATIONS; i++) { | |
| mat4x4_mul_clang(A, B, C_clang); | |
| } | |
| printf("Benchmarking Clang implementation...\n"); | |
| start = get_nanos(); | |
| for (int64_t i = 0; i < ITERATIONS; i++) { | |
| mat4x4_mul_clang(A, B, C_clang); | |
| } | |
| end = get_nanos(); | |
| int64_t clang_time = end - start; | |
| double clang_ns_per_op = (double)clang_time / ITERATIONS; | |
| printf("\n=== Results ===\n"); | |
| printf("GCC version:\n"); | |
| printf(" Total time: %.3f seconds\n", gcc_time / 1e9); | |
| printf(" Time per operation: %.2f ns\n", gcc_ns_per_op); | |
| printf("\n"); | |
| printf("Clang version:\n"); | |
| printf(" Total time: %.3f seconds\n", clang_time / 1e9); | |
| printf(" Time per operation: %.2f ns\n", clang_ns_per_op); | |
| printf("\n"); | |
| if (gcc_ns_per_op < clang_ns_per_op) { | |
| double speedup = clang_ns_per_op / gcc_ns_per_op; | |
| printf("GCC is %.2fx faster (%.1f%% faster)\n", | |
| speedup, (speedup - 1.0) * 100.0); | |
| } else { | |
| double speedup = gcc_ns_per_op / clang_ns_per_op; | |
| printf("Clang is %.2fx faster (%.1f%% faster)\n", | |
| speedup, (speedup - 1.0) * 100.0); | |
| } | |
| return 0; | |
| } | |
| #endif |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment