Last active
December 13, 2021 04:07
-
-
Save dmbfm/0c7e868624a3f8e390348b9f2cf3d7a7 to your computer and use it in GitHub Desktop.
Performance counters on Apple M1 processors using Zig!
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
const std = @import("std"); | |
// Sources: | |
// - https://github.com/lemire/Code-used-on-Daniel-Lemire-s-blog/blob/master/2021/03/24/m1cycles.cpp | |
// - https://gitea.com/matteyeux/darwin-xnu/src/branch/master/osfmk/kperf | |
// - https://patchwork.ffmpeg.org/project/ffmpeg/patch/20210413004523.6500-1-josh@itanimul.li/ | |
// - https://github.com/GMUCERG/PQC_NEON/blob/main/neon/kyber/m1cycles.c | |
const c = @cImport({ | |
@cInclude("pthread.h"); | |
}); | |
const DynLib = std.DynLib; | |
const TypeInfo = std.builtin.TypeInfo; | |
const dlopen = std.c.dlopen; | |
const dlsym = std.c.dlsym; | |
const dlclose = std.c.dlclose; | |
/// Counter class indices and the single-bit masks derived from them.
///
/// Every `*Mask` constant is `1` shifted left by the matching class index, so
/// masks can be OR-ed together to select several classes at once.
const KpcClass = struct {
    // Class indices (used as bit positions).
    const Fixed = 0;
    const Configurable = 1;
    const Power = 2;
    const RawPmu = 3;

    // One mask per class index above.
    const FixedMask = maskFor(Fixed);
    const ConfigurableMask = maskFor(Configurable);
    const PowerMask = maskFor(Power);
    const RawPmuMask = maskFor(RawPmu);

    /// Returns the mask that has only bit `class_index` set.
    fn maskFor(comptime class_index: comptime_int) comptime_int {
        return 1 << class_index;
    }
};
/// Event selector values that are OR-ed into the configuration words of
/// `kpc_config`.
///
/// NOTE(review): the numeric values are taken as-is from the sources listed at
/// the top of the file; the meaning of each event is inferred from its name
/// only and has not been checked against hardware documentation.
const Cpmu = struct {
    const NONE = 0;

    // Cycle / instruction events.
    const CORE_CYCLE = 0x02;
    const INST_A64 = 0x8c;
    const INST_BRANCH = 0x8d;

    // SYNC_* events.
    const SYNC_DC_LOAD_MISS = 0xbf;
    const SYNC_DC_STORE_MISS = 0xc0;
    const SYNC_DTLB_MISS = 0xc1;
    const SYNC_ST_HIT_YNGR_LD = 0xc4;
    const SYNC_BR_ANY_MISP = 0xcb;

    // FED_* events.
    const FED_IC_MISS_DEM = 0xd3;
    const FED_ITLB_MISS = 0xd4;
};
/// Enable bits that are OR-ed together with a `Cpmu` event value to build one
/// configuration word.
///
/// NOTE(review): judging by the names, each bit enables counting in one
/// execution mode (EL0 AArch32, EL0 AArch64, EL1, EL3) — confirm against the
/// referenced sources.
const CfgWord = struct {
    const EL0A32EN_MASK = 1 << 16;
    const EL0A64EN_MASK = 1 << 17;
    const EL1EN_MASK = 1 << 18;
    const EL3EN_MASK = 1 << 19;

    /// All four enable bits at once (0xf0000).
    const ALLMODES_MASK = EL0A32EN_MASK | EL0A64EN_MASK | EL1EN_MASK | EL3EN_MASK;
};
/// Class mask passed to every kpc call in this file: fixed + configurable.
const KpcMask = KpcClass.FixedMask | KpcClass.ConfigurableMask;

/// Counter count that `initRdtsc` requires `kpc_get_counter_count` to report;
/// also the length of `kpc_config` and `kpc_counters`.
const CountersCount = 10;

/// Config word count that `initRdtsc` requires `kpc_get_config_count` to report.
const ConfigCount = 8;
// Function types for the symbols loaded from the kperf framework (see
// `kpc_functions_map`).
//
// These describe C functions resolved at run time through `DynLib.lookup`, so
// they are declared with `callconv(.C)`. Without it the calls would go through
// Zig's unspecified default calling convention, which is not guaranteed to
// match the C ABI.
//
// Note: `KpcSetConfigFn` and `KpcGetThreadCountersFn` take the buffer address
// as a `u64`; the callers in this file pass `@ptrToInt(...)` of their buffers.
const KpcGetCounterCountFn = fn (u32) callconv(.C) u32;
const KpcGetConfigCountFn = fn (u32) callconv(.C) u32;
const KpcGetCountingFn = fn () callconv(.C) c_int;
const KpcForceAllCtrsSetFn = fn (c_int) callconv(.C) c_int;
const KpcSetCountingFn = fn (u32) callconv(.C) c_int;
const KpcSetThreadCountingFn = fn (u32) callconv(.C) c_int;
const KpcSetConfigFn = fn (u32, u64) callconv(.C) c_int;
const KpcGetConfigFn = fn (u32, *c_void) callconv(.C) c_int;
const KpcSetPeriodFn = fn (u32, *c_void) callconv(.C) c_int;
const KpcGetPeriodFn = fn (u32, *c_void) callconv(.C) c_int;
const KpcPerfSampleGetFn = fn (*c_int) callconv(.C) c_int;
const KpcGetThreadCountersFn = fn (c_int, c_uint, u64) callconv(.C) c_int;
/// Table of the kperf symbols to load: the exported symbol name and the Zig
/// function type it is looked up as.
///
/// This table drives both the comptime generation of `KpcApi` (one field per
/// entry, named after the symbol) and the run-time symbol lookup in
/// `initRdtsc`.
const kpc_functions_map = [_]struct { name: [:0]const u8, type: type }{
    .{ .name = "kpc_get_counter_count", .type = KpcGetCounterCountFn },
    // Uses its own alias rather than the counter-count one (same signature).
    .{ .name = "kpc_get_config_count", .type = KpcGetConfigCountFn },
    .{ .name = "kpc_get_counting", .type = KpcGetCountingFn },
    .{ .name = "kpc_force_all_ctrs_set", .type = KpcForceAllCtrsSetFn },
    .{ .name = "kpc_set_counting", .type = KpcSetCountingFn },
    .{ .name = "kpc_set_thread_counting", .type = KpcSetThreadCountingFn },
    .{ .name = "kpc_set_config", .type = KpcSetConfigFn },
    .{ .name = "kpc_get_config", .type = KpcGetConfigFn },
    .{ .name = "kpc_set_period", .type = KpcSetPeriodFn },
    .{ .name = "kpc_get_period", .type = KpcGetPeriodFn },
    // Takes a pointer (`*c_int`), not an integer: use the dedicated alias
    // instead of `KpcForceAllCtrsSetFn`.
    .{ .name = "kperf_sample_get", .type = KpcPerfSampleGetFn },
    .{ .name = "kpc_get_thread_counters", .type = KpcGetThreadCountersFn },
};
/// Configuration words handed to `kpc_set_config` by `initRdtsc`.
///
/// Only slots 0, 3, 4 and 5 are programmed; every other slot stays zero.
var kpc_config: [CountersCount]u64 = init: {
    var words = [_]u64{0} ** CountersCount;
    words[0] = Cpmu.CORE_CYCLE | CfgWord.EL0A64EN_MASK;
    words[3] = Cpmu.INST_BRANCH | CfgWord.EL0A64EN_MASK;
    words[4] = Cpmu.SYNC_BR_ANY_MISP | CfgWord.EL0A64EN_MASK;
    words[5] = Cpmu.INST_A64 | CfgWord.EL0A64EN_MASK;
    break :init words;
};

/// Buffer whose address is passed to `kpc_get_thread_counters` in
/// `getCounters`; left uninitialized until that call.
var kpc_counters: [CountersCount]u64 = undefined;
// TODO: Report this compiler bug. Declaring the API struct directly as a
// global, i.e.
//
//     var api: KpcApi = undefined;
//
// crashes the compiler with:
//
//     Assertion failed at /Users/daniel/Sources/zig/src/stage1/analyze.cpp:8753
//     in resolve_llvm_types_struct. This is a bug in the Zig compiler.
//
// Wrapping the struct in `KpcApiWrapper` avoids the crash.

/// Thin wrapper around `KpcApi`; exists only as a workaround for the compiler
/// crash described above.
const KpcApiWrapper = struct {
    api: KpcApi,
};

/// Global holder for the loaded kperf function table. Filled in by
/// `initRdtsc`; undefined before that.
var kpc_api: KpcApiWrapper = undefined;
/// Struct type built at comptime from `kpc_functions_map`: one field per table
/// entry, named after the symbol and typed with the entry's function type.
///
/// A hand-written struct would work just as well; this was done mainly as an
/// exercise in generating a struct at comptime with Zig.
const KpcApi = make_api: {
    // Describe one struct field for every entry of the symbol table.
    var field_list: [kpc_functions_map.len]TypeInfo.StructField = undefined;
    inline for (kpc_functions_map) |entry, idx| {
        field_list[idx] = TypeInfo.StructField{
            .name = entry.name,
            .field_type = entry.type,
            .default_value = null,
            .is_comptime = false,
            .alignment = @alignOf(entry.type),
        };
    }

    // Freeze the list so a pointer to it can be stored in the type info.
    const struct_fields = field_list;

    break :make_api @Type(TypeInfo{
        .Struct = .{
            .layout = .Auto,
            .fields = &struct_fields,
            .decls = &[_]TypeInfo.Declaration{},
            .is_tuple = false,
        },
    });
};
/// Loads the private kperf framework, resolves every symbol listed in
/// `kpc_functions_map`, stores the resulting table in the global `kpc_api`
/// and programs/enables the counters described by `kpc_config`.
///
/// Errors:
/// - any error returned by `std.DynLib.open`
/// - `error.KpcSymbolNotFound` if a symbol from the table is missing
/// - `error.WrongCounterCount` / `error.WrongConfigCount` if the reported
///   counts differ from `CountersCount` / `ConfigCount`
/// - `error.KpcSetConfigFailed`, `error.KpcForceAllCtrsSetFailed`,
///   `error.KpcSetCountingFailed`, `error.KpcSetThreadCountingFailed` if the
///   corresponding kpc call returns non-zero
///
/// Per the note accompanying this file, the program has to be run with `sudo`
/// for this to work.
pub fn initRdtsc() !void {
    // The handle is never closed: the function pointers stored in `kpc_api`
    // point into the loaded library and are used later by `getCounters`.
    var kperf = try std.DynLib.open("/System/Library/PrivateFrameworks/kperf.framework/Versions/A/kperf");

    var api: KpcApi = undefined;
    inline for (kpc_functions_map) |item| {
        // A missing symbol is reported as an error instead of unwrapping with
        // `.?`, which would panic (or be undefined behavior in ReleaseFast).
        @field(api, item.name) = kperf.lookup(item.type, item.name) orelse {
            std.log.err("kperf symbol not found: {s}", .{item.name});
            return error.KpcSymbolNotFound;
        };
        std.log.info("{s}: {}", .{ item.name, @field(api, item.name) });
    }
    kpc_api.api = api;

    // Sanity-check that the system reports the layout this file is written for.
    if (api.kpc_get_counter_count(KpcMask) != CountersCount) {
        return error.WrongCounterCount;
    }
    if (api.kpc_get_config_count(KpcMask) != ConfigCount) {
        return error.WrongConfigCount;
    }

    // Program the configuration words, then enable counting.
    if (api.kpc_set_config(KpcMask, @ptrToInt(&kpc_config[0])) != 0) {
        return error.KpcSetConfigFailed;
    }
    if (api.kpc_force_all_ctrs_set(1) != 0) {
        return error.KpcForceAllCtrsSetFailed;
    }
    if (api.kpc_set_counting(KpcMask) != 0) {
        return error.KpcSetCountingFailed;
    }
    if (api.kpc_set_thread_counting(KpcMask) != 0) {
        return error.KpcSetThreadCountingFailed;
    }
}
/// Requests the user-interactive QoS class for the calling thread, then
/// initializes the performance counters via `initRdtsc`.
///
/// The result of the QoS request is deliberately ignored; any error from
/// `initRdtsc` is returned to the caller.
pub fn setupPeformanceCounters() !void {
    const qos_status = c.pthread_set_qos_class_self_np(c.QOS_CLASS_USER_INTERACTIVE, 0);
    _ = qos_status;

    return initRdtsc();
}
/// Snapshot of the four counter values returned by `getCounters`.
const PerformanceCounters = struct {
    /// Value read from counter slot 2.
    cycles: u64,
    /// Value read from counter slot 5.
    branches: u64,
    /// Value read from counter slot 6.
    missed_branches: u64,
    /// Value read from counter slot 7.
    instructions: u64,
};
/// Reads the current thread's counters into `kpc_counters` and returns the
/// four values this program reports.
///
/// Requires a successful `initRdtsc` beforehand (it fills in `kpc_api`).
/// Returns `error.KpcGetThreadCountersError` if `kpc_get_thread_counters`
/// returns non-zero.
pub fn getCounters() !PerformanceCounters {
    const status = kpc_api.api.kpc_get_thread_counters(0, CountersCount, @ptrToInt(&kpc_counters));
    if (status != 0) return error.KpcGetThreadCountersError;

    // Each value is read two slots past the index of its word in `kpc_config`.
    // NOTE(review): presumably the first two slots hold the fixed counters —
    // confirm against the referenced sources.
    const slot_offset = 2;
    return PerformanceCounters{
        .cycles = kpc_counters[slot_offset + 0],
        .branches = kpc_counters[slot_offset + 3],
        .missed_branches = kpc_counters[slot_offset + 4],
        .instructions = kpc_counters[slot_offset + 5],
    };
}
const stdout = std.io.getStdOut().writer();

/// Demo: sets up the counters, runs a small random-number workload and prints
/// the counter snapshots before and after it together with the cycle cost per
/// iteration.
pub fn main() !void {
    // Number of workload iterations; also the divisor for the per-iteration cost.
    const iterations = 100_000;

    try setupPeformanceCounters();

    const before = try getCounters();
    try stdout.print("Before: {}\n", .{before});

    // Keep the PRNG in a named variable: `random()` hands out an interface that
    // points at the generator state, so the generator must outlive its use.
    var prng = std.rand.DefaultPrng.init(0);
    const random = prng.random();

    // NOTE(review): `s` is never read afterwards, so an optimizing build may
    // remove part or all of this loop — confirm when benchmarking in release
    // modes.
    var s: u64 = 0;
    var i: usize = 0;
    while (i < iterations) : (i += 1) {
        s += random.intRangeAtMost(u64, 0, 10000);
    }

    const after = try getCounters();
    try stdout.print("After: {}\n", .{after});

    const delta = after.cycles - before.cycles;
    try stdout.print("{} cycles, {d:.2} cycles/iteration\n", .{ delta, @intToFloat(f32, delta) / @as(f32, iterations) });
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Just a note: you must run this with `sudo` for it to work!