Skip to content

Instantly share code, notes, and snippets.

@matu3ba
Last active June 22, 2024 14:43
Show Gist options
  • Save matu3ba/eb52a8132cd4fc39b1836275341b8ce5 to your computer and use it in GitHub Desktop.
Save matu3ba/eb52a8132cd4fc39b1836275341b8ce5 to your computer and use it in GitHub Desktop.
benchmarks for muloti
Benchmark 1: ./mulo_fast
Time (mean ± σ): 1.008 s ± 0.009 s [User: 1.006 s, System: 0.001 s]
Range (min … max): 0.996 s … 1.027 s 10 runs
Warning: The first benchmarking run for this command was significantly slower than the
rest (1.027 s). This could be caused by (filesystem) caches that were not filled until af
ter the first run. You should consider using the '--warmup' option to fill those caches b
efore the actual benchmark. Alternatively, use the '--prepare' option to clear the caches
before each timing run.
Benchmark 2: ./mulo_hack
Time (mean ± σ): 1.256 s ± 0.004 s [User: 1.250 s, System: 0.001 s]
Range (min … max): 1.248 s … 1.260 s 10 runs
Benchmark 3: ./mulo_llvm
Time (mean ± σ): 17.403 s ± 0.155 s [User: 17.363 s, System: 0.005 s]
Range (min … max): 17.314 s … 17.836 s 10 runs
Summary
'./mulo_fast' ran
1.25 ± 0.01 times faster than './mulo_hack'
17.26 ± 0.22 times faster than './mulo_llvm'
With loop size of 1_000_000_000
Benchmark 1: ./mulo_fast
Time (mean ± σ): 21.826 s ± 1.182 s [User: 21.791 s, System: 0.001 s]
Range (min … max): 20.844 s … 24.191 s 10 runs
Benchmark 2: ./mulo_hack
⠙ Current estimate: 26.246 s ██████████████░░░░░░░░░░░░░░░░░░░░░░ ETA 00:02:39
const std = @import("std");
const builtin = @import("builtin");
const math = std.math;
inline fn muloXi4_generic(comptime ST: type, a: ST, b: ST, overflow: *c_int) ST {
@setRuntimeSafety(builtin.is_test);
const BSIZE = @bitSizeOf(ST);
comptime var UT = switch (ST) {
i32 => u32,
i64 => u64,
i128 => u128,
else => unreachable,
};
const min = @bitCast(ST, @as(UT, 1 << (BSIZE - 1)));
const max = ~min;
overflow.* = 0;
const result = a *% b;
// edge cases
if (a == min) {
if (b != 0 and b != 1) overflow.* = 1;
return result;
}
if (b == min) {
if (a != 0 and a != 1) overflow.* = 1;
return result;
}
// take sign of x sx
const sa = a >> (BSIZE - 1);
const sb = b >> (BSIZE - 1);
// take absolute value of a and b via
// abs(x) = (x^sx)) - sx
const abs_a = (a ^ sa) -% sa;
const abs_b = (b ^ sb) -% sb;
// unitary magnitude, cannot have overflow
if (abs_a < 2 or abs_b < 2) return result;
// compare the signs of operands
if ((a ^ b) >> (BSIZE - 1) != 0) {
if (abs_a > @divTrunc(max, abs_b)) overflow.* = 1;
} else {
if (abs_a > @divTrunc(min, -abs_b)) overflow.* = 1;
}
return result;
}
fn __muloti4(a: i128, b: i128, overflow: *c_int) callconv(.C) i128 {
return muloXi4_generic(i128, a, b, overflow);
}
inline fn mulvXi_genericSmall(comptime ST: type, a: ST, b: ST, overflow: *c_int) ST {
@setRuntimeSafety(builtin.is_test);
const min = math.minInt(ST);
var res: ST = a *% b;
// Hacker's Delight section Overflow subsection Multiplication
// case a=-2^{31}, b=-1 problem, because
// on some machines a*b = -2^{31} with overflow
// Then -2^{31}/-1 overflows and any result is possible.
// => check with a<0 and b=-2^{31}
if ((a < 0 and b == min) or (a != 0 and @divTrunc(res, a) != b))
overflow.* = 1;
return @truncate(ST, res);
}
// adjusted mulv
fn __mulvti3(a: i128, b: i128, overflow: *c_int) callconv(.C) i128 {
return mulvXi_genericSmall(i128, a, b, overflow);
}
inline fn mulvXi_genericFast(comptime ST: type, a: ST, b: ST, overflow: *c_int) ST {
@setRuntimeSafety(builtin.is_test);
overflow.* = 0;
const EST = switch (ST) {
i32 => i64,
i64 => i128,
i128 => i256,
else => unreachable,
};
const min = math.minInt(ST);
const max = math.maxInt(ST);
var res: EST = @as(EST, a) * @as(EST, b);
//invariant: -2^{bitwidth(EST)} <= res <= 2^{bitwidth(EST)-1}
if (res < min or max < res)
overflow.* = 1;
return @truncate(ST, res);
}
fn __mulvti3fast(a: i128, b: i128, overflow: *c_int) callconv(.C) i128 {
return mulvXi_genericFast(i128, a, b, overflow);
}
pub fn main() !void {
var x: i128 = 0;
var y: i128 = 0;
var ov: c_int = 0;
var res: i128 = 0;
var sum: i128 = 0;
var sum2: i128 = 0;
const stdout = std.io.getStdOut();
stdout.writeAll("starting\n") catch unreachable;
while (x < 50_000_000) {
//res = __muloti4(x, y, &ov);
//res = __mulvti3(x, y, &ov);
res = __mulvti3fast(x, y, &ov);
x += 1;
y += 1;
sum += res;
if (sum > 1_000_000) {
sum2 += 1;
sum = 0;
}
//std.debug.assert(ov != 1);
}
if (ov == 1) stdout.writeAll("error: overflow happened\n") catch unreachable;
//std.debug.print("sum2: {d}\n", .{sum2});
if (sum2 > 0) stdout.writeAll("finished\n") catch unreachable;
std.process.exit(0);
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment