Skip to content

Instantly share code, notes, and snippets.

@josephg
Created June 24, 2024 03:19
Show Gist options
  • Save josephg/46355f27b00e3006c538659ecdf525fb to your computer and use it in GitHub Desktop.
Save josephg/46355f27b00e3006c538659ecdf525fb to your computer and use it in GitHub Desktop.
Benchmark
use std::hint::black_box;
use criterion::Criterion;
use rand::{RngCore, SeedableRng};
use rand::rngs::StdRng;
#[no_mangle]
#[inline]
pub fn lzcnt(n: u64) -> u32
{
n.leading_zeros()
}
#[no_mangle]
#[inline]
pub fn lzcnt_branchless(n: u64) -> u32 {
if cfg!(all(target_arch = "x86_64", not(target_feature="lzcnt"))) {
(n | 1).leading_zeros() + (n == 0) as u32
} else {
n.leading_zeros()
}
}
// Generates the same ASM as above.
// #[no_mangle]
// #[inline]
// pub fn lzcnt_branchless_2(n: u64) -> u32 {
// let non_zero = n | 1;
//
// let lzcnt = non_zero.leading_zeros();
//
// lzcnt + (n == 0) as u32
// }
fn gen_numbers(n: usize, seed: u64) -> Vec<u64> {
let mut rng = StdRng::seed_from_u64(seed);
(0..n).map(|_| {
let n = rng.next_u32() as u64;
n * n
}).collect()
}
fn main() {
let mut c = Criterion::default()
.configure_from_args();
let nums = gen_numbers(100_000, 123123);
c.bench_function("lz", |b| {
b.iter(|| {
let sum: u32 = nums.iter().copied().map(lzcnt).sum();
black_box(sum);
});
});
c.bench_function("lz_branchless", |b| {
b.iter(|| {
let sum: u32 = nums.iter().copied().map(lzcnt_branchless).sum();
black_box(sum);
});
});
c.final_summary();
}
$ cargo run --release -- --bench
Compiling leading_zero_bench v0.1.0 (/home/seph/temp/leading_zero_bench)
Finished `release` profile [optimized] target(s) in 0.51s
Running `target/release/leading_zero_bench --bench`
lz time: [87.799 µs 87.983 µs 88.223 µs]
lz_branchless time: [27.554 µs 27.581 µs 27.611 µs]
@josephg
Copy link
Author

josephg commented Jun 24, 2024

And the asm for both versions:

lzcnt:
	.cfi_startproc
	test rdi, rdi
	je .LBB275_1
	bsr rax, rdi
	xor rax, 63
	ret
.LBB275_1:
	mov eax, 64
	ret

lzcnt_branchless:
	.cfi_startproc
	mov rax, rdi
	or rax, 1
	bsr rax, rax
	xor eax, 63
	cmp rdi, 1
	adc eax, 0
	ret

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment