Skip to content

Instantly share code, notes, and snippets.

@pankkor
Last active February 26, 2023 17:52
Show Gist options
  • Save pankkor/d414d42e9fd6c2376311871324e93f3c to your computer and use it in GitHub Desktop.
Save pankkor/d414d42e9fd6c2376311871324e93f3c to your computer and use it in GitHub Desktop.
Benchmark u32 array sum(); CFLAGS = -std=c11 -Wall -Wextra -Wpedantic -O3 -mcpu=apple-m1
#ifndef __aarch64__
#error Architectures other than arm64 are not supported
#endif // #ifndef __aarch64__
#include <arm_neon.h>
#include <stdio.h> // printf
typedef int i32;
typedef unsigned int u32;
typedef long i64;
typedef unsigned long u64;
typedef float f32;
typedef double f64;
#define BENCH_ESCAPE(p) __asm__ volatile("" : : "m"(p) : "memory")
#define BENCH_CLOBBER() __asm__ volatile("" : : : "memory")
#define ALIGNED(x) __attribute__((aligned(x)))
#define FORCE_INLINE inline __attribute__((always_inline))
#define ARRAY_COUNT(x) (u64)(sizeof(x) / sizeof(x[0]))
#define DSB(option) __asm__ volatile ("dsb " #option : : : "memory")
#define NO_OPT __attribute__((optnone))
#define LOOP_NO_OPT _Pragma(\
"clang loop unroll(disable) vectorize(disable) interleave(disable)")
static FORCE_INLINE u64 rdtsc(void) {
u64 val;
// use isb to avoid speculative read of cntvct_el0
__asm__ volatile("isb;\n\tmrs %0, cntvct_el0" : "=r" (val) :: "memory");
return val;
}
static FORCE_INLINE u64 tsc_freq(void) {
u64 val;
__asm__ volatile("mrs %0, cntfrq_el0" : "=r" (val));
return val;
}
static FORCE_INLINE f64 tsc_to_s(f64 tsc) {
return tsc / tsc_freq();
}
static u64 benchmark(void (func)(void), i32 count) {
u64 min_cycles = -1;
for (i32 i = 0; i < count; ++i) {
u64 beg = rdtsc();
DSB(nsh);
func();
DSB(nsh);
u64 end = rdtsc();
u64 cycles = end - beg;
if (cycles < min_cycles) {
min_cycles = cycles;
}
}
return min_cycles;
}
// Note: it's not trivial to obtain CPU frequency on Apple silicon.
// Core freqency varies depending on the type of a core,
// how many cores are active within the cluster and core's power state.
// To keep things simple just hardcore 3237Mhz
// (highest frequency value for M1 Pro obtained from IORegistry)
u64 CPU_FREQ_HZ = 3237460535;
static void benchmark_print_result(
const char *benchmark_name,
f64 tsc,
i32 iters) {
f64 ns = tsc_to_s(tsc * 1e9);
f64 ipc = 1.0 / tsc_to_s(CPU_FREQ_HZ * tsc);
printf("--- %s ---\n", benchmark_name);
printf("Total time %.8fns\n", ns);
printf("Time per iteration %.8fns\n", ns / iters);
printf("Instructions per cycle %.8f\n\n", ipc);
fflush(stdout);
}
// benchmnarks
static FORCE_INLINE u32 sum_naive(i32 count, u32 * restrict in) {
u32 sum = 0;
LOOP_NO_OPT
for (i32 i = 0; i < count; ++i) {
sum += in[i];
}
return sum;
}
static FORCE_INLINE u32 sum_unroll2(i32 count, u32 * restrict in) {
u32 sum0 = 0;
u32 sum1 = 0;
u32 *it = in;
u32 *end = in + count - 1;
LOOP_NO_OPT
while (it < end) {
sum0 += it[0];
sum1 += it[1];
it += 2;
}
return sum0 + sum1;
}
static FORCE_INLINE u32 sum_unroll4(i32 count, u32 * restrict in) {
u32 sum0 = 0;
u32 sum1 = 0;
u32 sum2 = 0;
u32 sum3 = 0;
u32 *it = in;
u32 *end = in + count - 3;
LOOP_NO_OPT
while (it < end) {
sum0 += it[0];
sum1 += it[1];
sum2 += it[2];
sum3 += it[3];
it += 4;
}
return sum0 + sum1 + sum2 + sum3;
}
static FORCE_INLINE u32 sum_unroll8(i32 count, u32 * restrict in) {
u32 sum0 = 0;
u32 sum1 = 0;
u32 sum2 = 0;
u32 sum3 = 0;
u32 sum4 = 0;
u32 sum5 = 0;
u32 sum6 = 0;
u32 sum7 = 0;
u32 *it = in;
u32 *end = in + count - 7;
LOOP_NO_OPT
while (it < end) {
sum0 += it[0];
sum1 += it[1];
sum2 += it[2];
sum3 += it[3];
sum4 += it[4];
sum5 += it[5];
sum6 += it[6];
sum7 += it[7];
it += 8;
}
return sum0 + sum1 + sum2 + sum3 + sum4 + sum5 + sum6 + sum7;
}
static FORCE_INLINE u32 sum_neon4(i32 count, u32 * restrict in) {
uint32x4_t sum0 = vdupq_n_u32(0);
u32 *it = in;
u32 *end = in + count - 3;
while (it < end) {
uint32x4_t v0 = vld1q_u32(it);
sum0 = vaddq_u32(sum0, v0);
it += 4;
}
return vaddvq_u32(sum0);
}
static FORCE_INLINE u32 sum_neon8(i32 count, u32 * restrict in) {
uint32x4_t sum0 = vdupq_n_u32(0);
uint32x4_t sum1 = vdupq_n_u32(0);
u32 *it = in;
u32 *end = in + count - 7;
while (it < end) {
uint32x4_t v0 = vld1q_u32(it + 0);
uint32x4_t v1 = vld1q_u32(it + 4);
sum0 = vaddq_u32(sum0, v0);
sum1 = vaddq_u32(sum1, v1);
it += 8;
}
uint32x4_t t = vaddq_u32(sum0, sum1);
return vaddvq_u32(t);
}
static FORCE_INLINE u32 sum_neon16(i32 count, u32 * restrict in) {
uint32x4_t sum0 = vdupq_n_u32(0);
uint32x4_t sum1 = vdupq_n_u32(0);
uint32x4_t sum2 = vdupq_n_u32(0);
uint32x4_t sum3 = vdupq_n_u32(0);
u32 *it = in;
u32 *end = in + count - 15;
while (it < end) {
// Note: using 4 vld1q_u32 intrinsics results in two ldp asm instructions
// using vld2q or vld4q is 2 times slower than ldp
uint32x4_t v0 = vld1q_u32(it + 0);
uint32x4_t v1 = vld1q_u32(it + 4);
uint32x4_t v2 = vld1q_u32(it + 8);
uint32x4_t v3 = vld1q_u32(it + 12);
sum0 = vaddq_u32(sum0, v0);
sum1 = vaddq_u32(sum1, v1);
sum2 = vaddq_u32(sum2, v2);
sum3 = vaddq_u32(sum3, v3);
it += 16;
}
uint32x4_t t0 = vaddq_u32(sum0, sum1);
uint32x4_t t1 = vaddq_u32(sum2, sum3);
uint32x4_t t = vaddq_u32(t0, t1);
return vaddvq_u32(t);
}
static FORCE_INLINE u32 sum_neon32(i32 count, u32 * restrict in) {
uint32x4_t sum0 = vdupq_n_u32(0);
uint32x4_t sum1 = vdupq_n_u32(0);
uint32x4_t sum2 = vdupq_n_u32(0);
uint32x4_t sum3 = vdupq_n_u32(0);
uint32x4_t sum4 = vdupq_n_u32(0);
uint32x4_t sum5 = vdupq_n_u32(0);
uint32x4_t sum6 = vdupq_n_u32(0);
uint32x4_t sum7 = vdupq_n_u32(0);
u32 *it = in;
u32 *end = in + count - 31;
while (it < end) {
// Note: using 4 vld1q_u32 intrinsics results in two ldp asm instructions
// using vld2q or vld4q is 2 times slower than ldp
uint32x4_t v0 = vld1q_u32(it + 0);
uint32x4_t v1 = vld1q_u32(it + 4);
uint32x4_t v2 = vld1q_u32(it + 8);
uint32x4_t v3 = vld1q_u32(it + 12);
uint32x4_t v4 = vld1q_u32(it + 16);
uint32x4_t v5 = vld1q_u32(it + 20);
uint32x4_t v6 = vld1q_u32(it + 24);
uint32x4_t v7 = vld1q_u32(it + 28);
sum0 = vaddq_u32(sum0, v0);
sum1 = vaddq_u32(sum1, v1);
sum2 = vaddq_u32(sum2, v2);
sum3 = vaddq_u32(sum3, v3);
sum4 = vaddq_u32(sum4, v4);
sum5 = vaddq_u32(sum5, v5);
sum6 = vaddq_u32(sum6, v6);
sum7 = vaddq_u32(sum7, v7);
it += 32;
}
uint32x4_t t0 = vaddq_u32(sum0, sum1);
uint32x4_t t1 = vaddq_u32(sum2, sum3);
uint32x4_t t2 = vaddq_u32(sum4, sum5);
uint32x4_t t3 = vaddq_u32(sum6, sum7);
uint32x4_t t01 = vaddq_u32(t0, t1);
uint32x4_t t23 = vaddq_u32(t2, t3);
uint32x4_t t = vaddq_u32(t01, t23);
return vaddvq_u32(t);
}
ALIGNED(128) u32 arr[4096] = {0};
#define GEN_BENCHMARK_FUNC_SUM(name) \
void benchmark_##name(void) { \
u32 sum = 0; \
BENCH_ESCAPE(sum); \
sum = (name)(ARRAY_COUNT(arr), arr); \
BENCH_CLOBBER(); \
}
GEN_BENCHMARK_FUNC_SUM(sum_naive)
GEN_BENCHMARK_FUNC_SUM(sum_unroll2)
GEN_BENCHMARK_FUNC_SUM(sum_unroll4)
GEN_BENCHMARK_FUNC_SUM(sum_unroll8)
GEN_BENCHMARK_FUNC_SUM(sum_neon4)
GEN_BENCHMARK_FUNC_SUM(sum_neon8)
GEN_BENCHMARK_FUNC_SUM(sum_neon16)
GEN_BENCHMARK_FUNC_SUM(sum_neon32)
int main() {
printf("Approx CPU freq %ldHz (%.0fMHz)\n",
CPU_FREQ_HZ, CPU_FREQ_HZ * 1e-6);
printf("Time Stamp Counter freq %ldHz (%.0fMHz)\n",
tsc_freq(), tsc_freq() * 1e-6);
printf("\nBenchmarks\n");
// touch array and assign some values to it
for (u32 i = 0; i < ARRAY_COUNT(arr); ++i) {
arr[i] = i * 10;
}
enum {BENCH_COUNT = 2 * 1000 * 1000};
struct {
const char *name;
void (*func)(void);
} benchmarks[] = {
{"sum_naive", benchmark_sum_naive},
{"sum_unroll2", benchmark_sum_unroll2},
{"sum_unroll4", benchmark_sum_unroll4},
{"sum_unroll8", benchmark_sum_unroll8},
{"sum_neon4", benchmark_sum_neon4},
{"sum_neon8", benchmark_sum_neon8},
{"sum_neon16", benchmark_sum_neon16},
{"sum_neon32", benchmark_sum_neon32},
};
for (u64 i = 0; i < ARRAY_COUNT(benchmarks); ++i) {
benchmark_print_result(
benchmarks[i].name,
(f64)benchmark(benchmarks[i].func, BENCH_COUNT) / ARRAY_COUNT(arr),
BENCH_COUNT);
}
return 0;
}
@pankkor
Copy link
Author

pankkor commented Feb 14, 2023

Note: clang has barriers and special registers read builtins

__builtin_arm_isb(ISB_SY);
__builtin_arm_rsr64("CNTVCT_EL0");

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment