Navigation Menu

Skip to content

Instantly share code, notes, and snippets.

@thomcc
Created June 1, 2018 23:31
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save thomcc/a6a0abf52de03e853dd410d744e4605b to your computer and use it in GitHub Desktop.
Save thomcc/a6a0abf52de03e853dd410d744e4605b to your computer and use it in GitHub Desktop.
#![feature(test)]
#![recursion_limit = "1024"]
#![allow(dead_code)]
#![feature(stdsimd)]
extern crate test;
extern crate rand;
// Defines a little benchmarking DSL macro. If you've been linked here, it's safe to ignore.
//
// The module provides three things:
//   * `BenchGroupItem`, which adapts a `fn(size, &mut Bencher)` into a dynamic
//     benchmark for the `test` crate,
//   * `static_bench_desc` / `dyn_bench_desc`, which wrap a benchmark in a
//     `TestDescAndFn` record,
//   * the `benchmarks!` macro, which expands a list of `group` / `bench` / `fn`
//     items into a `get_benchmarks()` function returning those records.
#[macro_use]
mod bench_helpers {
use super::*;
// One benchmark of a group variant at one input size.
// Field 0 is the input size handed to the runner, field 1 is the byte count
// reported through `Bencher::bytes`, field 2 is the runner itself.
pub struct BenchGroupItem<F>(usize, u64, F);
impl<F: Fn(usize, &mut test::Bencher)> BenchGroupItem<F> {
// `size` is the element count, `tsz` the size of one element in bytes;
// their product is stored as the byte count for this benchmark.
pub fn new(size: usize, tsz: usize, runner: F) -> Self {
BenchGroupItem(size, (size as u64) * (tsz as u64), runner)
}
}
impl<F: Fn(usize, &mut test::Bencher) + Send> test::TDynBenchFn for BenchGroupItem<F> {
// Publishes the byte count on the bencher, then calls the runner with the
// stored size.
fn run(&self, b: &mut test::Bencher) {
b.bytes = self.1;
(self.2)(self.0, b);
}
}
// Wraps a plain benchmark function in a `TestDescAndFn` with a static name.
// The description is never ignored, may not fail and is not expected to panic.
pub fn static_bench_desc(name: &'static str, f: fn(&mut test::Bencher)) -> test::TestDescAndFn {
test::TestDescAndFn {
desc: test::TestDesc {
name: test::StaticTestName(name),
ignore: false,
allow_fail: false,
should_panic: test::ShouldPanic::No,
},
testfn: test::TestFn::StaticBenchFn(f)
}
}
// Same as `static_bench_desc`, but for a boxed dynamic benchmark with a name
// built at run time.
pub fn dyn_bench_desc(name: String, boxed: ::std::boxed::Box<test::TDynBenchFn + 'static>) -> test::TestDescAndFn {
test::TestDescAndFn {
desc: test::TestDesc {
name: test::DynTestName(name),
ignore: false,
allow_fail: false,
should_panic: test::ShouldPanic::No,
},
testfn: test::TestFn::DynBenchFn(boxed)
}
}
// `benchmarks!` consumes its input one item at a time.
//
// The `@bench_group` rules process the body of one `group`. Their leading
// arguments carry the state accumulated so far:
//   ($benches) group-name, bytes-per-element, sizes, size-parameter-name,
//   {setup tokens}, {remaining items}
// The `@inner` rules process the top-level item list, and the final rule is the
// public entry point.
#[macro_export]
macro_rules! benchmarks {
// Group body exhausted (nothing left but optional semicolons): emit nothing.
(@bench_group($benches:ident) $name:ident, $tsz:expr, $sizes:expr, $sn:ident, {$($setup:tt)*}, {$(;)*}) => {};
// `sizes[T] = expr;` replaces the element size with `size_of::<T>()` and the
// size list with `expr`.
(@bench_group($benches:ident) $name:ident, $_ts:expr, $_sz:expr, $sn:ident, {$($setup:tt)*}, {
sizes[$t:ty] = $sizes:expr; $($rest:tt)*
}) => {
benchmarks!(@bench_group($benches) $name, ::std::mem::size_of::<$t>(), $sizes, $sn, {$($setup)*}, { $($rest)* });
};
// `setup { ... };` replaces the setup tokens and keeps the current
// size-parameter name.
(@bench_group($benches:ident) $name:ident, $tsz:expr, $sizes:expr, $sn:ident, {$($_setup:tt)*}, {
setup {$($setup:tt)*}; $($rest:tt)*
}) => {
benchmarks!(@bench_group($benches) $name, $tsz, $sizes, $sn, {$($setup)*}, { $($rest)* });
};
// `setup(name) { ... };` replaces the setup tokens and also renames the size
// parameter, so the setup code can refer to the size as `name`.
(@bench_group($benches:ident) $name:ident, $tsz:expr, $sizes:expr, $_sn:ident, {$($_setup:tt)*}, {
setup($size_name:ident) {$($setup:tt)*}; $($rest:tt)*
}) => {
benchmarks!(@bench_group($benches) $name, $tsz, $sizes, $size_name, {$($setup)*}, { $($rest)* });
};
// `variant name { body };` is shorthand for `cond_variant(true) name { body };`.
(@bench_group($benches:ident) $name:ident, $tsz:expr, $sizes:expr, $szname:ident, {$($setup:tt)*}, {
variant $var_name:ident { $($vbody:tt)* };
$($rest:tt)*
}) => {
benchmarks!(@bench_group($benches) $name, $tsz, $sizes, $szname, {$($setup)*}, {
cond_variant(true) $var_name { $($vbody)* };
$($rest)*
});
};
// `cond_variant(cond) name { body };` registers the variant only when `cond`
// evaluates to true at run time; otherwise it prints a "skipped" line.
(@bench_group($benches:ident) $name:ident, $tsz:expr, $sizes:expr, $szname:ident, {$($setup:tt)*}, {
cond_variant($cond:expr) $var_name:ident { $($vbody:tt)* };
$($rest:tt)*
}) => {
if $cond {
// The setup tokens sit outside the `iter` closure, so they run once per call
// of this function rather than once per timed iteration.
fn $var_name($szname: usize, bencher: &mut test::Bencher) {
$($setup)*;
bencher.iter(|| { $($vbody)* });
}
// One dynamic benchmark per configured size, named
// "<group>_<variant> for size = <size>".
for &size in $sizes.iter() {
$benches.push($crate::bench_helpers::dyn_bench_desc(
format!("{}_{} for size = {}", stringify!($name), stringify!($var_name), size),
::std::boxed::Box::new($crate::bench_helpers::BenchGroupItem::new(size, $tsz, $var_name))
));
}
} else {
println!("Benchmark {}_{} skipped because the test `{}` failed",
stringify!($name), stringify!($var_name),
stringify!($cond));
}
// Continue with the remaining items of the group.
benchmarks!(@bench_group($benches) $name, $tsz, $sizes, $szname, {$($setup)*}, { $($rest)* });
};
// Top-level list exhausted (empty, or a lone trailing semicolon).
(@inner($_benches:ident)) => {};
(@inner($_benches:ident) ;) => {};
// `group name { ... };` opens a fresh block and processes the group body
// starting from the defaults: element size 0, sizes `[0]`, size parameter
// `_size`, empty setup.
(@inner($benches:ident) group $name:ident { $($inner:tt)+ }; $($rest:tt)*) => {
{ benchmarks!(@bench_group($benches) $name, 0, [0], _size, {}, { $($inner)+ }); };
benchmarks!(@inner($benches) $($rest)*);
};
// `bench name { expr };` defines `fn name` that times `expr` and registers it
// as a static benchmark.
(@inner($benches:ident) bench $name:ident { $e:expr }; $($rest:tt)*) => {
fn $name(bencher_: &mut test::Bencher) { bencher_.iter(|| $e); }
$benches.push($crate::bench_helpers::static_bench_desc(stringify!($name), $name));
benchmarks!(@inner($benches) $($rest)*);
};
// `bench name (b) { body };` is the same, but the body receives the bencher
// under the identifier `b` and drives it itself.
(@inner($benches:ident) bench $name:ident ($id:ident) { $($body:tt)* }; $($rest:tt)*) => {
fn $name($id : &mut test::Bencher) { $($body)* }
$benches.push($crate::bench_helpers::static_bench_desc(stringify!($name), $name));
benchmarks!(@inner($benches) $($rest)*);
};
// `fn name(args) { body };` emits the function as written and registers it as
// a static benchmark.
(@inner($benches:ident) fn $name:ident ($($args:tt)*) { $($body:tt)* }; $($rest:tt)*) => {
fn $name($($args)*) { $($body)* }
$benches.push($crate::bench_helpers::static_bench_desc(stringify!($name), $name));
benchmarks!(@inner($benches) $($rest)*);
};
// Entry point: generates `get_benchmarks()`, which builds and returns the full
// list of benchmark descriptions.
($($benchmarks:tt)*) => {
fn get_benchmarks() -> Vec<test::TestDescAndFn> {
let mut benches = vec![];
benchmarks!(@inner(benches) $($benchmarks)*);
benches
}
};
}
}
/// Returns an empty `Vec<f32>` whose buffer starts on a 32-byte boundary and
/// has room for at least `capacity` floats.
///
/// The buffer is allocated as a vector of 32-byte-aligned chunks of eight
/// floats (`capacity / 8 + 1` of them) and then reinterpreted as a vector of
/// `f32`.
///
/// # Panics
///
/// Panics if the chunk type does not have the expected size or alignment, or
/// if the resulting buffer is not 32-byte aligned.
///
/// NOTE(review): the returned vector releases (or regrows) the buffer using
/// `f32`'s alignment, not the 32-byte alignment it was allocated with. The
/// `Vec::from_raw_parts` contract asks for matching alignment, so this should
/// be confirmed as acceptable for the allocator in use. Growing the vector
/// past its initial capacity may also lose the 32-byte alignment.
fn aligned_f32_vec(capacity: usize) -> Vec<f32> {
    use std::mem;

    #[repr(C)]
    #[repr(align(32))]
    struct Chunk([f32; 8]);

    let chunk_align = mem::align_of::<Chunk>();
    assert_eq!(mem::size_of::<Chunk>(), 32);
    assert!(chunk_align >= 32, "alignment was {}", chunk_align);

    // Always one chunk more than `capacity / 8`, so a partial trailing chunk fits.
    let mut chunks: Vec<Chunk> = Vec::with_capacity(capacity / 8 + 1);
    let chunk_cap = chunks.capacity();
    let base = chunks.as_mut_ptr() as *mut f32;
    // Ownership of the allocation moves to the `Vec<f32>` built below.
    mem::forget(chunks);

    let floats = unsafe { Vec::from_raw_parts(base, 0, chunk_cap * 8) };
    assert_eq!((floats.as_ptr() as usize) & 31, 0);
    floats
}
/// Builds a 32-byte-aligned vector of `sz` random floats.
///
/// Each element is `rand::random::<f32>() * 2.0 - 1.0` (presumably in
/// `[-1, 1)`, assuming `rand` yields values in `[0, 1)` — confirm against the
/// `rand` version in use).
///
/// # Panics
///
/// Panics if the filled buffer is not 32-byte aligned.
fn rand_vec(sz: usize) -> Vec<f32> {
    let mut values = aligned_f32_vec(sz);
    values.extend((0..sz).map(|_| rand::random::<f32>() * 2.0 - 1.0));
    // Filling must not have moved the buffer off its aligned allocation.
    assert_eq!((values.as_ptr() as usize) & 31, 0);
    values
}
/// Dot product written as a single `fold` over the zipped element pairs.
///
/// # Panics
///
/// Panics if `a` and `b` differ in length.
#[no_mangle]
#[inline(never)]
pub fn dot_fold_iter(a: &[f32], b: &[f32]) -> f32 {
    assert_eq!(a.len(), b.len());
    let pairs = a.iter().zip(b.iter());
    pairs.fold(0.0, |acc, (&lhs, &rhs)| acc + lhs * rhs)
}
/// Dot product written as a `for` loop over the zipped slice iterators.
///
/// # Panics
///
/// Panics if `a` and `b` differ in length.
#[no_mangle]
#[inline(never)]
pub fn dot_iter(a: &[f32], b: &[f32]) -> f32 {
    assert_eq!(a.len(), b.len());
    let mut acc = 0.0;
    for (&lhs, &rhs) in a.iter().zip(b) {
        acc += lhs * rhs;
    }
    acc
}
/// Dot product written as an index loop using unchecked element access.
///
/// # Panics
///
/// Panics if `a` and `b` differ in length.
#[no_mangle]
#[inline(never)]
pub fn dot_slices_unchecked(a: &[f32], b: &[f32]) -> f32 {
    assert_eq!(a.len(), b.len());
    let n = a.len();
    let mut acc = 0.0;
    for idx in 0..n {
        // SAFETY: `idx < n`, and `n` equals the length of both slices (asserted above).
        acc += unsafe { *a.get_unchecked(idx) * *b.get_unchecked(idx) };
    }
    acc
}
/// Dot product written as an index loop using ordinary (bounds-checked)
/// indexing.
///
/// # Panics
///
/// Panics if `a` and `b` differ in length.
#[no_mangle]
#[inline(never)]
pub fn dot_slices_checked(a: &[f32], b: &[f32]) -> f32 {
    assert_eq!(a.len(), b.len());
    let n = a.len();
    let mut acc = 0.0;
    for idx in 0..n {
        acc += a[idx] * b[idx];
    }
    acc
}
/// Dot product written as a raw-pointer walk over both slices.
///
/// # Panics
///
/// Panics if `a` and `b` differ in length.
#[no_mangle]
#[inline(never)]
pub fn dot_ptr(a: &[f32], b: &[f32]) -> f32 {
    assert_eq!(a.len(), b.len());
    let mut acc = 0.0;
    let mut lhs = a.as_ptr();
    let mut rhs = b.as_ptr();
    // SAFETY: `lhs` stays within `a` (or one past its end, where it is only
    // compared, never read); `rhs` advances in lockstep and `b` has the same
    // length as `a`.
    unsafe {
        let stop = lhs.add(a.len());
        while lhs < stop {
            acc += *lhs * *rhs;
            lhs = lhs.add(1);
            rhs = rhs.add(1);
        }
    }
    acc
}
/// Dot product using 128-bit SSE intrinsics, four floats per step.
///
/// Groups of four elements are multiplied and accumulated into a vector of
/// four partial sums. Those are added together at the end, and the up to
/// three leftover elements are handled by a scalar loop.
///
/// # Safety
///
/// The CPU must support the `sse` target feature.
///
/// # Panics
///
/// Panics if `a` and `b` differ in length, or if either slice does not start
/// on a 16-byte boundary.
#[no_mangle]
#[inline(never)]
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
#[target_feature(enable = "sse")]
pub unsafe fn dot_sse(a: &[f32], b: &[f32]) -> f32 {
    #[cfg(target_arch = "x86")] use std::arch::x86::*;
    #[cfg(target_arch = "x86_64")] use std::arch::x86_64::*;
    assert_eq!(a.len(), b.len());
    let mut p_a = a.as_ptr();
    let mut p_b = b.as_ptr();
    // `_mm_load_ps` below requires 16-byte-aligned addresses.
    assert_eq!((p_a as usize) & 15, 0, "a is not 16 byte aligned...");
    assert_eq!((p_b as usize) & 15, 0, "b is not 16 byte aligned...");
    let p_trail_end = p_a.offset(a.len() as isize);
    // End of the part that can be processed four elements at a time.
    let p_vec_end = p_a.offset((a.len() & !3) as isize);
    let mut sums = _mm_setzero_ps();
    while p_a < p_vec_end {
        let x = _mm_load_ps(p_a);
        let y = _mm_load_ps(p_b);
        // A fused multiply-add would need the `fma` target feature, which this
        // function does not enable, so multiply and add are issued separately.
        sums = _mm_add_ps(_mm_mul_ps(x, y), sums);
        p_a = p_a.offset(4);
        p_b = p_b.offset(4);
    }
    let main_result = {
        let mut sum_data = [0.0f32; 4];
        // A `[f32; 4]` local is only guaranteed 4-byte alignment, so the
        // unaligned store must be used here.
        _mm_storeu_ps(sum_data.as_mut_ptr(), sums);
        sum_data[0] + sum_data[1] + sum_data[2] + sum_data[3]
    };
    // Scalar tail for the last `len % 4` elements.
    let mut trail_result = 0.0;
    while p_a < p_trail_end {
        trail_result += (*p_a) * (*p_b);
        p_a = p_a.offset(1);
        p_b = p_b.offset(1);
    }
    main_result + trail_result
}
/// Dot product using 256-bit AVX intrinsics, eight floats per step.
///
/// Groups of eight elements are multiplied and accumulated into a vector of
/// eight partial sums, which is reduced horizontally at the end. The up to
/// seven leftover elements are handled by a scalar loop.
///
/// # Safety
///
/// The CPU must support the `avx` target feature.
///
/// # Panics
///
/// Panics if `a` and `b` differ in length, or if either slice does not start
/// on a 32-byte boundary.
#[no_mangle]
#[inline(never)]
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
#[target_feature(enable = "avx")]
pub unsafe fn dot_avx(a: &[f32], b: &[f32]) -> f32 {
    #[cfg(target_arch = "x86")] use std::arch::x86::*;
    #[cfg(target_arch = "x86_64")] use std::arch::x86_64::*;
    const LANES: usize = 8;

    assert_eq!(a.len(), b.len());
    let n = a.len();
    let mut lhs = a.as_ptr();
    let mut rhs = b.as_ptr();
    // `_mm256_load_ps` below requires 32-byte-aligned addresses.
    assert_eq!((lhs as usize) & 31, 0, "a is not 32 byte aligned...");
    assert_eq!((rhs as usize) & 31, 0, "b is not 32 byte aligned...");

    let stop = lhs.add(n);
    // End of the part that can be processed eight elements at a time.
    let vec_stop = lhs.add(n - n % LANES);

    let mut acc = _mm256_setzero_ps();
    while lhs < vec_stop {
        let x = _mm256_load_ps(lhs);
        let y = _mm256_load_ps(rhs);
        // Separate multiply and add; only `avx` is enabled for this function.
        acc = _mm256_add_ps(_mm256_mul_ps(x, y), acc);
        lhs = lhs.add(LANES);
        rhs = rhs.add(LANES);
    }

    // Horizontal reduction: add the swapped 128-bit halves, then two rounds of
    // pairwise adds, and read the lowest lane.
    let swapped = _mm256_permute2f128_ps(acc, acc, 1);
    let halves = _mm256_add_ps(swapped, acc);
    let pairs = _mm256_hadd_ps(halves, halves);
    let total = _mm256_hadd_ps(pairs, pairs);
    let head = _mm256_cvtss_f32(total);

    // Scalar tail for the last `n % 8` elements.
    let mut tail = 0.0;
    while lhs < stop {
        tail += *lhs * *rhs;
        lhs = lhs.add(1);
        rhs = rhs.add(1);
    }
    head + tail
}
use test::black_box;
// Declares the benchmark suite. The `benchmarks!` macro expands this into
// `get_benchmarks()`, which registers every variant below once per listed size.
benchmarks! {
group dot {
// Elements are `f32`, so each benchmark reports `size * size_of::<f32>()`
// bytes. Only sizes 512 and 519 are active; 519 is not a multiple of 4 or 8,
// so it also exercises the scalar tail of the SIMD variants.
sizes[f32] = [/*3, 9, 64, 100, 256, */512, 519];
// Runs at the start of each generated variant function, outside the timed
// closure; `size` is the current entry of the size list.
setup(size) {
let a = rand_vec(size);
let b = rand_vec(size);
};
// Each variant calls one dot-product implementation. `black_box` is applied
// to the inputs and the result so the work is not optimised away.
variant ptr {
black_box(dot_ptr(black_box(&a), black_box(&b)))
};
variant slice_unchecked {
black_box(dot_slices_unchecked(black_box(&a), black_box(&b)))
};
variant slices_checked {
black_box(dot_slices_checked(black_box(&a), black_box(&b)))
};
variant iter {
black_box(dot_iter(black_box(&a), black_box(&b)))
};
variant fold_iter {
black_box(dot_fold_iter(black_box(&a), black_box(&b)))
};
// The SIMD variants are registered only when the CPU reports the required
// feature at run time; otherwise the macro prints a "skipped" message.
cond_variant(is_x86_feature_detected!("sse")) simd_sse {
black_box(unsafe { dot_sse(black_box(&a), black_box(&b)) })
};
cond_variant(is_x86_feature_detected!("avx")) simd_avx {
black_box(unsafe { dot_avx(black_box(&a), black_box(&b)) })
};
};
}
/// Prints the target architecture followed by one `<feature>: <bool>` line for
/// each CPU feature probed at run time.
///
/// On targets other than x86 / x86_64 it prints `not intel!` and returns
/// without probing anything.
fn print_feats() {
    let arch = match (cfg!(target_arch = "x86"), cfg!(target_arch = "x86_64")) {
        (true, _) => "x86",
        (false, true) => "x86_64",
        (false, false) => {
            println!("not intel!");
            return;
        }
    };
    println!("Running on: {}", arch);

    // Probes that are written out but disabled in this listing, in order:
    // aes, pclmulqdq, rdrand, rdseed, tsc (before mmx); sha (before avx);
    // avx512f, avx512cd, avx512er, avx512pf, avx512bw, avx512dq, avx512vl,
    // avx512ifma, avx512vbmi, avx512vpopcntdq (before fma); bmi1, bmi2, abm,
    // lzcnt, tbm, popcnt, fxsr, xsave, xsaveopt, xsaves, xsavec (after fma).
    // Each would be reported as
    // `println!("<name>: {}", is_x86_feature_detected!("<name>"));`.
    println!("mmx: {}", is_x86_feature_detected!("mmx"));
    println!("sse: {}", is_x86_feature_detected!("sse"));
    println!("sse2: {}", is_x86_feature_detected!("sse2"));
    println!("sse3: {}", is_x86_feature_detected!("sse3"));
    println!("ssse3: {}", is_x86_feature_detected!("ssse3"));
    println!("sse4.1: {}", is_x86_feature_detected!("sse4.1"));
    println!("sse4.2: {}", is_x86_feature_detected!("sse4.2"));
    println!("sse4a: {}", is_x86_feature_detected!("sse4a"));
    println!("avx: {}", is_x86_feature_detected!("avx"));
    println!("avx2: {}", is_x86_feature_detected!("avx2"));
    println!("fma: {}", is_x86_feature_detected!("fma"));
}
/// Entry point: prints the detected CPU features, then runs every registered
/// benchmark through the `test` crate's console runner.
///
/// # Panics
///
/// Panics if the console runner returns an error.
fn main() {
    let opts = test::TestOpts {
        // Settings chosen for this binary: run benchmarks, uncoloured output.
        bench_benchmarks: true,
        color: test::NeverColor,
        // Remaining fields (described as defaults by the original author).
        format: test::OutputFormat::Pretty,
        options: test::Options::new(),
        run_tests: true,
        run_ignored: false,
        list: false,
        nocapture: false,
        filter: None,
        filter_exact: false,
        skip: vec![],
        logfile: None,
        test_threads: None,
    };
    print_feats();
    let benches = get_benchmarks();
    test::run_tests_console(&opts, benches).unwrap();
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment