Skip to content

Instantly share code, notes, and snippets.

@kprotty
Created October 26, 2022 02:31
Show Gist options
  • Save kprotty/9f45dde0eaea94a9a8d13097ee44b3cf to your computer and use it in GitHub Desktop.
Save kprotty/9f45dde0eaea94a9a8d13097ee44b3cf to your computer and use it in GitHub Desktop.
const std = @import("std");
const builtin = @import("builtin");
const Random = std.rand.Random;
const Atomic = std.atomic.Atomic;
pub fn main() !void {
try bench(struct {
pub const name = "FAA";
pub fn inc(counter: *Atomic(usize), current: usize, rng: *Random) usize {
_ = rng;
_ = current;
return counter.fetchAdd(1, .Monotonic) + 1;
}
});
try bench(struct {
pub const name = "CAS";
pub fn inc(counter: *Atomic(usize), current: usize, rng: *Random) usize {
_ = rng;
var value = current;
while (true) {
const new_value = value + 1;
value = counter.tryCompareAndSwap(
value,
new_value,
.Monotonic,
.Monotonic,
) orelse return new_value;
}
}
});
try bench(struct {
pub const name = "CAS backoff";
pub fn inc(counter: *Atomic(usize), current: usize, rng: *Random) usize {
var value = current;
while (true) {
const new_value = value + 1;
value = counter.compareAndSwap(
value,
new_value,
.Monotonic,
.Monotonic,
) orelse return new_value;
// reload the value after spinning
defer value = counter.load(.Monotonic);
// On M1, it's better to spin with a single WFE instead
if (comptime builtin.target.os.tag.isDarwin() and builtin.target.cpu.arch == .aarch64) {
asm volatile("wfe" ::: "memory");
continue;
}
// Spin a randomized amount of time to avoid threads resonating
// at the same frequency and causing permanent contention.
// https://github.com/apple/swift-corelibs-libdispatch/blob/main/src/shims/yield.h#L102-L125
const spin_max = 128;
const spin_min = 32;
var spin = (rng.int(u32) & (spin_max - 1)) | (spin_min - 1);
while (spin > 0) : (spin -= 1) std.atomic.spinLoopHint();
}
}
});
}
var gpa = std.heap.GeneralPurposeAllocator(.{}){};
fn bench(comptime Config: type) !void {
var arena = std.heap.ArenaAllocator.init(gpa.allocator());
defer arena.deinit();
const allocator = arena.allocator();
const num_cpus = try std.Thread.getCpuCount();
const Shared = extern struct {
front_padding: [std.atomic.cache_line]u8 = undefined,
counter: Atomic(usize) = Atomic(usize).init(0),
back_padding: [std.atomic.cache_line]u8 = undefined,
};
const Worker = struct {
shutdown: Atomic(bool) = Atomic(bool).init(false),
thread: std.Thread = undefined,
increments: usize = 0,
shared: *Shared,
fn run(worker: *@This(), seed: usize) void {
var local_inc: usize = 0;
defer worker.increments = local_inc;
var rng = std.rand.DefaultPrng.init(seed + 1);
var random = rng.random();
const shared = worker.shared;
var current = shared.counter.load(.Monotonic);
while (!worker.shutdown.load(.Monotonic)) {
current = Config.inc(&shared.counter, current, &random);
local_inc += 1;
}
}
};
const workers = try allocator.alloc(Worker, num_cpus);
defer allocator.free(workers);
var shared = Shared{};
for (workers) |*w, i| {
w.* = .{ .shared = &shared };
w.thread = try std.Thread.spawn(.{}, Worker.run, .{w, i});
}
const measure_duration_ns = 1000 * std.time.ns_per_ms;
std.time.sleep(measure_duration_ns);
for (workers) |*w| w.shutdown.store(true, .Monotonic);
for (workers) |w| w.thread.join();
var min: i128 = std.math.maxInt(i128);
var max: i128 = 0;
var sum: i128 = 0;
for (workers) |w| {
sum += w.increments;
min = std.math.min(min, w.increments);
max = std.math.max(max, w.increments);
}
const avg = @divFloor(sum, workers.len);
const throughput_ms = @divFloor(sum, measure_duration_ns / std.time.ns_per_ms);
var stdev: i128 = 0;
for (workers) |w| stdev += (w.increments - avg) * 2;
stdev = @divFloor(stdev, workers.len);
std.debug.print(
\\-------------------------
\\| {s}
\\-------------------------
\\| {s:>10} | {s:>8} |
\\| {d:>7}/ms | {d:>8} |
\\
\\
, .{
Config.name,
"throughput", "stdev",
@intCast(u128, throughput_ms),
@intCast(u128, stdev),
});
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment