Skip to content

Instantly share code, notes, and snippets.

@notcancername
Last active January 7, 2024 03:41
Show Gist options
  • Save notcancername/d63e27235d6ee59d9e68bb12470d668d to your computer and use it in GitHub Desktop.
Save notcancername/d63e27235d6ee59d9e68bb12470d668d to your computer and use it in GitHub Desktop.
Zig add vs add inline vs @addWithOverflow
const std = @import("std");
// Benchmark kernels, declared here and resolved at link time. Every kernel
// takes a pointer to `len` `usize` elements and returns a `usize` result.
// NOTE(review): the definitions are presumably the `export fn`s of the same
// names in the separately compiled kernel object — confirm against the build
// script.
extern fn not_inlined(ptr: [*]usize, len: usize) usize;
extern fn inlined(ptr: [*]usize, len: usize) usize;
extern fn builtin(ptr: [*]usize, len: usize) usize;
extern fn builtin_no_destructure(ptr: [*]usize, len: usize) usize;
// The `sum_*` declarations mirror the four above, one per variant.
extern fn sum_not_inlined(ptr: [*]usize, len: usize) usize;
extern fn sum_inlined(ptr: [*]usize, len: usize) usize;
extern fn sum_builtin(ptr: [*]usize, len: usize) usize;
extern fn sum_builtin_no_destructure(ptr: [*]usize, len: usize) usize;
/// Maps a read-only anonymous buffer of 100 Mi `usize` elements and runs the
/// kernel selected by the first character of the first command-line argument:
/// `n`, `i`, `b`, `d` pick the first four kernels and `a`, `x`, `c`, `e` the
/// `sum_*` ones.
///
/// Anonymous mappings start zero-filled, so every kernel is expected to
/// return 0. Returns `error.Shit` when the result is non-zero, when the
/// selector is unknown, or when no argument was supplied; `mmap` failures
/// propagate unchanged.
pub fn xmain() !void {
    const element_count = 100 << 20;
    const mapping = try std.os.mmap(null, element_count * @sizeOf(usize), std.os.PROT.READ, std.os.MAP.ANONYMOUS | std.os.MAP.PRIVATE, -1, 0);
    // Pair the mapping with its release right away.
    defer std.os.munmap(mapping);
    const datap: [*]usize = @ptrCast(mapping.ptr);
    const data: []usize = datap[0..element_count];
    // Without an argument, argv[1] would be read out of bounds. Selector 0
    // matches no kernel and takes the same `else` path as any unknown letter.
    const selector: u8 = if (std.os.argv.len > 1) std.os.argv[1][0] else 0;
    const r = switch (selector) {
        'n' => not_inlined(data.ptr, data.len),
        'i' => inlined(data.ptr, data.len),
        'b' => builtin(data.ptr, data.len),
        'd' => builtin_no_destructure(data.ptr, data.len),
        'a' => sum_not_inlined(data.ptr, data.len),
        'x' => sum_inlined(data.ptr, data.len),
        'c' => sum_builtin(data.ptr, data.len),
        'e' => sum_builtin_no_destructure(data.ptr, data.len),
        else => 1,
    };
    if (r != 0) return error.Shit;
}
/// Process entry point: exit status 0 when `xmain` succeeds, 1 on any error.
pub fn main() u8 {
    xmain() catch return 1;
    return 0;
}
#! /bin/sh
# Build the benchmark in two steps: compile the kernels to an object file,
# then compile the driver and link it against that object. LTO is disabled in
# both steps.
# NOTE(review): Zig's own CLI documents -mcpu=<cpu> for CPU selection; confirm
# that -march=skylake is accepted by the Zig version in use.
zig build-obj -fno-lto -O ReleaseFast -march=skylake test_add_inline.zig &&
zig build-exe -fstrip -fno-lto -O ReleaseFast -march=skylake bench_add_inline.zig test_add_inline.o
# not_inlined: rdi is used as the array base, rsi as the element count, and
# the result is left in rax. Each element is added to itself and then to the
# running total in rax; a carry out of either addition exits through .LBB0_7
# with rax = -1. A zero count returns 0.
not_inlined:
xor eax, eax
test rsi, rsi
je .LBB0_5
xor ecx, ecx
.LBB0_2:
mov rdx, qword ptr [rdi + 8*rcx]
add rdx, rdx
setb r8b
# The carry is also written to a stack byte; nothing in this listing reads
# that slot back.
setb byte ptr [rsp - 16]
test r8b, r8b
jne .LBB0_7
add rax, rdx
setb dl
setb byte ptr [rsp - 8]
test dl, dl
jne .LBB0_7
inc rcx
cmp rsi, rcx
jne .LBB0_2
.LBB0_5:
ret
.LBB0_7:
mov rax, -1
ret
# inlined: same register roles as the other kernels (rdi base, rsi count,
# rax result). The first element is loaded and doubled before the loop; the
# loop at .LBB1_6 first adds the previously doubled value into the total
# (r8) and then loads and doubles the next element. rax is preset to -1, so
# the carry exits at .LBB1_2 return -1 without a separate block; the normal
# exit at .LBB1_8 copies the total from r8 into rax.
inlined:
test rsi, rsi
je .LBB1_1
mov rdx, qword ptr [rdi]
add rdx, rdx
setb cl
setb byte ptr [rsp - 16]
mov rax, -1
test cl, cl
jne .LBB1_2
mov ecx, 1
xor r8d, r8d
.LBB1_6:
mov r9, r8
mov r8, rdx
add r8, r9
setb dl
setb byte ptr [rsp - 8]
test dl, dl
jne .LBB1_2
# rcx holds the index of the next element; equal to the count means done.
cmp rsi, rcx
je .LBB1_8
mov rdx, qword ptr [rdi + 8*rcx]
add rdx, rdx
setb r9b
setb byte ptr [rsp - 16]
inc rcx
test r9b, r9b
je .LBB1_6
.LBB1_2:
ret
.LBB1_1:
xor eax, eax
ret
.LBB1_8:
mov rax, r8
ret
# builtin: rdi base, rsi count, rax result; rcx carries the running total
# and rdx the index. Each of the two additions per element stores its carry
# to two stack bytes in addition to a register (three setb per addition);
# none of those stack bytes is read back in this listing. rax is set to -1
# before each carry test and to the running total before the loop-exit test.
builtin:
test rsi, rsi
je .LBB2_1
xor ecx, ecx
xor edx, edx
.LBB2_4:
mov r8, qword ptr [rdi + 8*rdx]
add r8, r8
setb byte ptr [rsp - 16]
setb r9b
setb byte ptr [rsp - 2]
mov rax, -1
test r9b, r9b
jne .LBB2_2
add rcx, r8
setb byte ptr [rsp - 8]
setb r8b
setb byte ptr [rsp - 1]
test r8b, r8b
jne .LBB2_2
inc rdx
mov rax, rcx
cmp rsi, rdx
jne .LBB2_4
.LBB2_2:
ret
.LBB2_1:
xor eax, eax
ret
# builtin_no_destructure: same loop shape as `builtin` (rdi base, rsi count,
# rcx running total, rdx index, rax result), but each addition stores its
# carry to a single stack byte instead of two.
builtin_no_destructure:
test rsi, rsi
je .LBB3_1
xor ecx, ecx
xor edx, edx
.LBB3_4:
mov r8, qword ptr [rdi + 8*rdx]
add r8, r8
setb r9b
setb byte ptr [rsp - 16]
mov rax, -1
test r9b, r9b
jne .LBB3_2
add rcx, r8
setb r8b
setb byte ptr [rsp - 8]
test r8b, r8b
jne .LBB3_2
inc rdx
mov rax, rcx
cmp rsi, rdx
jne .LBB3_4
.LBB3_2:
ret
.LBB3_1:
xor eax, eax
ret
# sum_not_inlined: rdi base, rsi count, rax running total and result. Each
# element is added straight from memory into rax; a carry exits through
# .LBB4_3 with rax = -1. A zero count returns 0.
sum_not_inlined:
xor eax, eax
test rsi, rsi
je .LBB4_5
xor ecx, ecx
.LBB4_2:
add rax, qword ptr [rdi + 8*rcx]
setb dl
setb byte ptr [rsp - 8]
test dl, dl
jne .LBB4_3
inc rcx
cmp rsi, rcx
jne .LBB4_2
.LBB4_5:
ret
.LBB4_3:
mov rax, -1
ret
# sum_inlined: rdi base, rsi count, rax result. The first element is loaded
# into rcx before the loop, and the loop at .LBB5_6 adds the remaining
# elements from memory. rax is preset to -1 for the carry exit at .LBB5_2;
# the normal exit at .LBB5_7 copies the total from rcx into rax.
sum_inlined:
test rsi, rsi
je .LBB5_1
mov rcx, qword ptr [rdi]
mov byte ptr [rsp - 8], 0
mov rax, -1
# edx is zeroed immediately before being tested, so the jne below cannot be
# taken.
xor edx, edx
test dl, dl
jne .LBB5_2
mov edx, 1
.LBB5_6:
cmp rsi, rdx
je .LBB5_7
add rcx, qword ptr [rdi + 8*rdx]
setb r8b
setb byte ptr [rsp - 8]
inc rdx
test r8b, r8b
je .LBB5_6
.LBB5_2:
ret
.LBB5_1:
xor eax, eax
ret
.LBB5_7:
mov rax, rcx
ret
# sum_builtin: same loop as sum_not_inlined (rdi base, rsi count, rax running
# total and result), except that the carry is stored to two stack bytes as
# well as to dl. Neither stack byte is read back in this listing.
sum_builtin:
xor eax, eax
test rsi, rsi
je .LBB6_5
xor ecx, ecx
.LBB6_2:
add rax, qword ptr [rdi + 8*rcx]
setb byte ptr [rsp - 8]
setb dl
setb byte ptr [rsp - 1]
test dl, dl
jne .LBB6_3
inc rcx
cmp rsi, rcx
jne .LBB6_2
.LBB6_5:
ret
.LBB6_3:
mov rax, -1
ret
# sum_builtin_no_destructure: rdi base, rsi count, rdx running total, rcx
# index, rax result. Each iteration loads an element, adds the running total
# with a carry check, and then adds the running total a second time with no
# carry check. The value carried to the next iteration is therefore
# element + 2 * previous total, so this listing does not compute a plain sum.
sum_builtin_no_destructure:
test rsi, rsi
je .LBB7_1
xor edx, edx
xor ecx, ecx
.LBB7_4:
mov rax, qword ptr [rdi + 8*rcx]
add rax, rdx
setb r8b
setb byte ptr [rsp - 8]
test r8b, r8b
jne .LBB7_5
# Second, unchecked addition of the previous running total.
add rax, rdx
inc rcx
mov rdx, rax
cmp rsi, rcx
jne .LBB7_4
ret
.LBB7_1:
xor eax, eax
ret
.LBB7_5:
mov rax, -1
ret
#! /bin/sh
# Run the benchmark binary under poop, four selectors per invocation: first
# n/i/b/d, then a/x/c/e. The first command of each invocation is the baseline
# the others are compared against.
# NOTE(review): -d is presumably the per-command sampling duration in
# milliseconds — confirm with `poop --help`.
bench=./bench_add_inline
poop -d 15000 --color never "$bench n" "$bench i" "$bench b" "$bench d"
poop -d 15000 --color never "$bench a" "$bench x" "$bench c" "$bench e"
Benchmark 1 (96 runs): ./bench_add_inline n
measurement mean ± σ min … max outliers delta
wall_time 157ms ± 15.6ms 133ms … 189ms 0 ( 0%) 0%
peak_rss 131KB ± 0 131KB … 131KB 0 ( 0%) 0%
cpu_cycles 444M ± 43.9M 381M … 537M 0 ( 0%) 0%
instructions 1.47G ± 9.04 1.47G … 1.47G 4 ( 4%) 0%
cache_references 26.7M ± 403K 26.0M … 27.3M 0 ( 0%) 0%
cache_misses 1.44M ± 557K 535K … 4.01M 1 ( 1%) 0%
branch_misses 254 ± 273 22 … 1.35K 4 ( 4%) 0%
Benchmark 2 (88 runs): ./bench_add_inline i
measurement mean ± σ min … max outliers delta
wall_time 171ms ± 15.8ms 142ms … 203ms 0 ( 0%) 💩+ 8.4% ± 2.9%
peak_rss 131KB ± 0 131KB … 131KB 0 ( 0%) - 0.0% ± 0.0%
cpu_cycles 481M ± 43.4M 409M … 572M 0 ( 0%) 💩+ 8.3% ± 2.8%
instructions 1.68G ± 11.4 1.68G … 1.68G 2 ( 2%) 💩+ 14.3% ± 0.0%
cache_references 26.9M ± 319K 26.3M … 27.3M 0 ( 0%) + 0.8% ± 0.4%
cache_misses 1.84M ± 1.33M 695K … 8.36M 8 ( 9%) 💩+ 28.2% ± 20.2%
branch_misses 200 ± 230 25 … 1.01K 2 ( 2%) - 21.5% ± 28.8%
Benchmark 3 (68 runs): ./bench_add_inline b
measurement mean ± σ min … max outliers delta
wall_time 221ms ± 24.0ms 181ms … 284ms 0 ( 0%) 💩+ 40.4% ± 3.9%
peak_rss 131KB ± 0 131KB … 131KB 0 ( 0%) - 0.0% ± 0.0%
cpu_cycles 602M ± 50.0M 521M … 708M 0 ( 0%) 💩+ 35.6% ± 3.3%
instructions 1.89G ± 11.9 1.89G … 1.89G 0 ( 0%) 💩+ 28.6% ± 0.0%
cache_references 26.9M ± 456K 26.3M … 27.7M 0 ( 0%) + 0.6% ± 0.5%
cache_misses 2.45M ± 1.88M 1.14M … 12.6M 8 (12%) 💩+ 70.7% ± 27.8%
branch_misses 201 ± 208 29 … 1.13K 2 ( 3%) - 21.0% ± 30.3%
Benchmark 4 (88 runs): ./bench_add_inline d
measurement mean ± σ min … max outliers delta
wall_time 172ms ± 15.8ms 142ms … 210ms 0 ( 0%) 💩+ 9.2% ± 2.9%
peak_rss 131KB ± 0 131KB … 131KB 0 ( 0%) - 0.0% ± 0.0%
cpu_cycles 479M ± 43.7M 410M … 576M 0 ( 0%) 💩+ 7.9% ± 2.9%
instructions 1.68G ± 9.05 1.68G … 1.68G 0 ( 0%) 💩+ 14.3% ± 0.0%
cache_references 26.8M ± 361K 26.3M … 27.4M 0 ( 0%) + 0.3% ± 0.4%
cache_misses 1.53M ± 526K 658K … 3.88M 1 ( 1%) + 6.7% ± 10.9%
branch_misses 576 ± 495 26 … 2.43K 2 ( 2%) 💩+126.3% ± 44.9%
Benchmark 1 (156 runs): ./bench_add_inline a
measurement mean ± σ min … max outliers delta
wall_time 96.4ms ± 14.9ms 79.0ms … 139ms 1 ( 1%) 0%
peak_rss 131KB ± 0 131KB … 131KB 0 ( 0%) 0%
cpu_cycles 268M ± 41.7M 221M … 385M 0 ( 0%) 0%
instructions 839M ± 69.9 839M … 839M 11 ( 7%) 0%
cache_references 26.3M ± 321K 25.8M … 26.7M 0 ( 0%) 0%
cache_misses 1.07M ± 928K 127K … 8.18M 3 ( 2%) 0%
branch_misses 264 ± 339 16 … 1.88K 10 ( 6%) 0%
Benchmark 2 (159 runs): ./bench_add_inline x
measurement mean ± σ min … max outliers delta
wall_time 93.9ms ± 12.5ms 78.4ms … 126ms 0 ( 0%) - 2.5% ± 3.2%
peak_rss 131KB ± 0 131KB … 131KB 0 ( 0%) - 0.0% ± 0.0%
cpu_cycles 259M ± 34.1M 220M … 344M 0 ( 0%) - 3.4% ± 3.1%
instructions 839M ± 6.61 839M … 839M 4 ( 3%) + 0.0% ± 0.0%
cache_references 26.1M ± 329K 25.7M … 26.7M 0 ( 0%) - 0.5% ± 0.3%
cache_misses 1.19M ± 1.09M 96.7K … 7.24M 9 ( 6%) + 10.6% ± 20.9%
branch_misses 157 ± 224 16 … 1.00K 27 (17%) ⚡- 40.6% ± 24.0%
Benchmark 3 (112 runs): ./bench_add_inline c
measurement mean ± σ min … max outliers delta
wall_time 133ms ± 24.5ms 98.7ms … 205ms 0 ( 0%) 💩+ 38.3% ± 4.9%
peak_rss 131KB ± 0 131KB … 131KB 0 ( 0%) - 0.0% ± 0.0%
cpu_cycles 343M ± 48.0M 279M … 479M 5 ( 4%) 💩+ 28.1% ± 4.0%
instructions 944M ± 29.6 944M … 944M 15 (13%) 💩+ 12.5% ± 0.0%
cache_references 26.5M ± 376K 25.8M … 27.4M 0 ( 0%) + 0.8% ± 0.3%
cache_misses 3.57M ± 4.00M 192K … 17.1M 13 (12%) 💩+233.2% ± 60.7%
branch_misses 289 ± 313 22 … 1.81K 4 ( 4%) + 9.5% ± 30.2%
Benchmark 4 (129 runs): ./bench_add_inline e
measurement mean ± σ min … max outliers delta
wall_time 117ms ± 14.9ms 97.5ms … 170ms 1 ( 1%) 💩+ 21.0% ± 3.6%
peak_rss 131KB ± 0 131KB … 131KB 0 ( 0%) - 0.0% ± 0.0%
cpu_cycles 319M ± 39.7M 274M … 461M 1 ( 1%) 💩+ 19.1% ± 3.6%
instructions 1.15G ± 32.0 1.15G … 1.15G 0 ( 0%) 💩+ 37.5% ± 0.0%
cache_references 26.2M ± 376K 25.7M … 26.9M 0 ( 0%) - 0.3% ± 0.3%
cache_misses 1.35M ± 1.14M 207K … 8.85M 5 ( 4%) 💩+ 25.6% ± 22.3%
branch_misses 414 ± 420 22 … 2.47K 3 ( 2%) 💩+ 56.9% ± 33.4%
const std = @import("std");
/// Adds `a` and `b`, returning `error.Overflow` when the result does not fit
/// in `T`. For `comptime_int` the plain sum is returned.
fn add(comptime T: type, a: T, b: T) (error{Overflow}!T) {
    if (T == comptime_int) return a + b;
    const result = @addWithOverflow(a, b);
    // Index 1 is the overflow bit, index 0 the wrapped sum.
    return if (result[1] == 0) result[0] else error.Overflow;
}
/// Call-site-inlined twin of the checked add: returns `a + b`, or
/// `error.Overflow` when the result does not fit in `T`. For `comptime_int`
/// the plain sum is returned.
inline fn addI(comptime T: type, a: T, b: T) (error{Overflow}!T) {
    if (T == comptime_int) return a + b;
    const result = @addWithOverflow(a, b);
    // Index 1 is the overflow bit, index 0 the wrapped sum.
    return if (result[1] == 0) result[0] else error.Overflow;
}
/// Accumulates `i + i` over the `len` elements at `ptr` using the non-inline
/// `add` helper. Despite the name, each element is doubled, not squared.
/// Propagates the helper's error if either addition overflows.
inline fn sumOfSquares(ptr: [*]usize, len: usize) !usize {
    var total: usize = 0;
    for (ptr[0..len]) |value| {
        const doubled = try add(usize, value, value);
        total = try add(usize, total, doubled);
    }
    return total;
}
/// Accumulates `i + i` over the `len` elements at `ptr` using the inline
/// `addI` helper. Despite the name, each element is doubled, not squared.
/// Propagates the helper's error if either addition overflows.
inline fn sumOfSquaresInlined(ptr: [*]usize, len: usize) !usize {
    var total: usize = 0;
    for (ptr[0..len]) |value| {
        const doubled = try addI(usize, value, value);
        total = try addI(usize, total, doubled);
    }
    return total;
}
/// Accumulates `i + i` over the `len` elements at `ptr`, calling
/// `@addWithOverflow` directly and destructuring its result. Despite the
/// name, each element is doubled, not squared.
/// Returns `maxInt(usize)` as soon as either addition overflows.
inline fn sumOfSquaresBuiltin(ptr: [*]usize, len: usize) usize {
    const overflow_sentinel: usize = std.math.maxInt(usize);
    var total: usize = 0;
    for (ptr[0..len]) |value| {
        const doubled, const doubling_carried = @addWithOverflow(value, value);
        if (doubling_carried != 0) return overflow_sentinel;
        total, const total_carried = @addWithOverflow(total, doubled);
        if (total_carried != 0) return overflow_sentinel;
    }
    return total;
}
/// Accumulates `i + i` over the `len` elements at `ptr`, calling
/// `@addWithOverflow` directly and indexing its result tuple instead of
/// destructuring it. Despite the name, each element is doubled, not squared.
/// Returns `maxInt(usize)` as soon as either addition overflows.
inline fn sumOfSquaresBuiltinNoDestructure(ptr: [*]usize, len: usize) usize {
    const overflow_sentinel: usize = std.math.maxInt(usize);
    var total: usize = 0;
    for (ptr[0..len]) |value| {
        const doubled = @addWithOverflow(value, value);
        if (doubled[1] != 0) return overflow_sentinel;
        const next_total = @addWithOverflow(total, doubled[0]);
        if (next_total[1] != 0) return overflow_sentinel;
        total = next_total[0];
    }
    return total;
}
/// Exported wrapper around `sumOfSquares`; any error is reported to the
/// caller as `maxInt(usize)`.
export fn not_inlined(ptr: [*]usize, len: usize) usize {
    if (sumOfSquares(ptr, len)) |total| {
        return total;
    } else |_| {
        return std.math.maxInt(usize);
    }
}
/// Exported wrapper around `sumOfSquaresInlined`; any error is reported to
/// the caller as `maxInt(usize)`.
export fn inlined(ptr: [*]usize, len: usize) usize {
    if (sumOfSquaresInlined(ptr, len)) |total| {
        return total;
    } else |_| {
        return std.math.maxInt(usize);
    }
}
/// Exported wrapper that forwards to `sumOfSquaresBuiltin` and returns its
/// result unchanged.
export fn builtin(ptr: [*]usize, len: usize) usize {
    const total = sumOfSquaresBuiltin(ptr, len);
    return total;
}
/// Exported wrapper that forwards to `sumOfSquaresBuiltinNoDestructure` and
/// returns its result unchanged.
export fn builtin_no_destructure(ptr: [*]usize, len: usize) usize {
    const total = sumOfSquaresBuiltinNoDestructure(ptr, len);
    return total;
}
/// Sums the `len` elements at `ptr` using the non-inline `add` helper,
/// propagating its error if an addition overflows.
inline fn sum(ptr: [*]usize, len: usize) !usize {
    var total: usize = 0;
    for (ptr[0..len]) |value| {
        total = try add(usize, total, value);
    }
    return total;
}
/// Sums the `len` elements at `ptr` using the inline `addI` helper,
/// propagating its error if an addition overflows.
inline fn sumInlined(ptr: [*]usize, len: usize) !usize {
    var total: usize = 0;
    for (ptr[0..len]) |value| {
        total = try addI(usize, total, value);
    }
    return total;
}
/// Sums the `len` elements at `ptr`, calling `@addWithOverflow` directly and
/// destructuring its result. Returns `maxInt(usize)` as soon as an addition
/// overflows.
inline fn sumBuiltin(ptr: [*]usize, len: usize) usize {
    const items = ptr[0..len];
    var total: usize = 0;
    for (items) |value| {
        total, const carried = @addWithOverflow(total, value);
        if (carried != 0) return std.math.maxInt(usize);
    }
    return total;
}
/// Sums the `len` elements at `ptr`, calling `@addWithOverflow` directly and
/// indexing its result tuple instead of destructuring it. Returns
/// `maxInt(usize)` as soon as an addition overflows.
inline fn sumBuiltinNoDestructure(ptr: [*]usize, len: usize) usize {
    var accum: usize = 0;
    for (ptr[0..len]) |i| {
        const ov = @addWithOverflow(accum, i);
        if (ov[1] != 0) return std.math.maxInt(usize);
        // ov[0] already holds accum + i, so it replaces the accumulator;
        // adding it on top would count the previous total twice.
        accum = ov[0];
    }
    return accum;
}
/// Exported wrapper around `sum`; any error is reported to the caller as
/// `maxInt(usize)`.
export fn sum_not_inlined(ptr: [*]usize, len: usize) usize {
    if (sum(ptr, len)) |total| {
        return total;
    } else |_| {
        return std.math.maxInt(usize);
    }
}
/// Exported wrapper around `sumInlined`; any error is reported to the caller
/// as `maxInt(usize)`.
export fn sum_inlined(ptr: [*]usize, len: usize) usize {
    if (sumInlined(ptr, len)) |total| {
        return total;
    } else |_| {
        return std.math.maxInt(usize);
    }
}
/// Exported wrapper that forwards to `sumBuiltin` and returns its result
/// unchanged.
export fn sum_builtin(ptr: [*]usize, len: usize) usize {
    const total = sumBuiltin(ptr, len);
    return total;
}
/// Exported wrapper that forwards to `sumBuiltinNoDestructure` and returns
/// its result unchanged.
export fn sum_builtin_no_destructure(ptr: [*]usize, len: usize) usize {
    const total = sumBuiltinNoDestructure(ptr, len);
    return total;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment