Skip to content

Instantly share code, notes, and snippets.

@s3bk

s3bk/Cargo.toml Secret

Created December 30, 2020 19:04
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save s3bk/352be8b3af719b51830470a0860a3381 to your computer and use it in GitHub Desktop.
Save s3bk/352be8b3af719b51830470a0860a3381 to your computer and use it in GitHub Desktop.
[package]
name = "fast"
version = "0.1.0"
authors = ["Sebastian K <s3bk@protonmail.com>"]
edition = "2018"
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
[dependencies]
rayon = "*"
fast-floats = "*"
num_cpus = "*"
#![feature(core_intrinsics)]
use std::time::{Instant};
use rayon::prelude::*; // is provided for parallel iterators
use fast_floats::{Fast, FF64};
// this function can't be changed (may only add attributes)
/// Polynomial approximation of `e^x - 1`.
///
/// The constants are `15!/k!` for `k = 14, 13, ..., 1` and the final factor is
/// `1/15!`, so the expression is the degree-15 Taylor polynomial
/// `x + x^2/2! + ... + x^15/15!` written in nested (Horner) form.
/// As written it performs 14 additions and 15 multiplications on `FF64`.
#[inline]
fn expm1(x: FF64) -> FF64 {
// Nested evaluation from the highest-order term outwards; the trailing
// multiplication by 7.647...e-13 (= 1/15!) normalizes the integer coefficients.
return ((((((((((((((15.0 + x) * x + 210.0) * x + 2730.0) * x + 32760.0) * x + 360360.0) * x + 3603600.0) * x + 32432400.0) * x + 259459200.0) * x + 1816214400.0) * x + 10897286400.0) * x + 54486432000.0) * x + 217945728000.0) * x + 653837184000.0) * x + 1307674368000.0) * x * 7.6471637318198164759011319857881e-13;
}
/// Bit mask that clears the low 8 bits of an address, i.e. rounds it down to a
/// multiple of 256 bytes.
const M256: usize = !0xFF;

/// Applies `func` to every element of `data`, processing the 256-byte-aligned
/// middle of the slice in parallel.
///
/// The slice is split into three regions:
///
/// 1. a head, from the start of `data` up to the first 256-byte boundary,
///    processed sequentially on the calling thread;
/// 2. an aligned body made of whole 1024-element chunks, processed through
///    rayon's `par_chunks_exact_mut`;
/// 3. whatever is left after the last whole chunk, processed sequentially.
///
/// If the slice contains no 256-byte-aligned span at all, everything is
/// processed sequentially.
///
/// `func` is `Copy`, so every call site below receives its own copy of the
/// closure; state mutated inside `func` is therefore not shared between regions
/// or chunks.
fn chunked_par(data: &mut [FF64], func: impl FnMut(&mut FF64) + Copy + Sync + Send) {
    // Size of one element in bytes, used to convert addresses into slice indices
    // (previously hard-coded as 8).
    const ELEM: usize = std::mem::size_of::<FF64>();
    // Number of elements handed to one parallel task.
    const CHUNK: usize = 1024;
    // The index arithmetic below relies on 256-byte boundaries falling on
    // element boundaries.
    debug_assert_eq!(256 % ELEM, 0);

    let data_ptr = data.as_ptr() as usize;
    // First 256-byte boundary at or after the start of the slice.
    let head_ptr = (data_ptr + 255) & M256;
    // Last 256-byte boundary at or before the end of the slice.
    let tail_ptr = (data_ptr + data.len() * ELEM) & M256;
    if tail_ptr <= head_ptr {
        // No aligned span fits inside the slice: nothing to parallelize.
        data.iter_mut().for_each(func);
        return;
    }

    let head_n = (head_ptr - data_ptr) / ELEM;
    let tail_n = (tail_ptr - data_ptr) / ELEM;
    let chunk_n = tail_n - head_n;
    // End index of the last whole chunk inside the aligned span.
    let chunk_tail_n = tail_n - chunk_n % CHUNK;
    debug_assert_eq!((chunk_tail_n - head_n) % CHUNK, 0);

    // Unaligned head.
    data[..head_n].iter_mut().for_each(func);

    // Aligned body, one fixed-size chunk per parallel task.
    data[head_n..chunk_tail_n]
        .par_chunks_exact_mut(CHUNK)
        .for_each(|s| {
            debug_assert_eq!(s.len(), CHUNK);
            debug_assert_eq!(s.as_ptr() as usize % 256, 0);
            // SAFETY: `par_chunks_exact_mut(CHUNK)` only yields slices of exactly
            // `CHUNK` elements, so reinterpreting the slice as `[FF64; CHUNK]` is
            // in bounds. The alignment hint holds because the body starts on a
            // 256-byte boundary and every chunk spans `CHUNK * ELEM` bytes, which
            // is a multiple of 256.
            let chunk: &mut [FF64; CHUNK] = unsafe {
                std::intrinsics::assume(s.as_ptr() as usize % 256 == 0);
                &mut *(s.as_mut_ptr() as *mut [FF64; CHUNK])
            };
            chunk.iter_mut().for_each(func);
        });

    // Remainder: the partial chunk plus the unaligned tail, in slice order.
    data[chunk_tail_n..].iter_mut().for_each(func);
}
// this function can't be changed (may only add attributes)
/// Applies [`expm1`] to `x` twelve times in succession, feeding each result
/// into the next call, and returns the final value.
fn twelve(x: FF64) -> FF64 {
return expm1( expm1( expm1( expm1( expm1( expm1( expm1( expm1( expm1( expm1( expm1( expm1(x))))))))))));
}
// you might want to optimize this one as well
/// Resets every element of `data` to `0.1` via [`chunked_par`].
fn populate(data: &mut [FF64]) {
    let seed: FF64 = Fast(0.1);
    chunked_par(data, move |slot: &mut FF64| {
        *slot = seed;
    });
}
// optimize this one
/// Replaces every element of `data` in place with `twelve` of its current
/// value, distributing the work through [`chunked_par`].
fn apply(data: &mut [FF64]) {
    let step = |value: &mut FF64| {
        let input = *value;
        *value = twelve(input);
    };
    chunked_par(data, step);
}
// you might want to optimize this one as well
/// Checks the first `n` elements of `data` against `twelve(0.1)` and prints one
/// line for every element that differs.
///
/// # Panics
///
/// Panics if `n` is greater than `data.len()`.
fn verify(data: &[FF64], n: usize) {
    // The reference value does not depend on the index, so compute it once
    // instead of once per element.
    let expected = twelve(Fast(0.1));
    // Slicing up front replaces the per-element bounds check with a single one.
    for (i, value) in data[..n].iter().enumerate() {
        if expected != *value {
            println!("error at {:?} - {:?} != {:?}", i, value, expected);
        }
    }
}
/// Runs one benchmark round and returns the measured throughput in GFlop/s.
///
/// The round populates `data`, times a single `apply` pass, verifies the first
/// `n` results, then prints and returns the throughput. Only the `apply` call
/// is inside the timed region.
fn run(data: &mut [FF64], n: usize) -> f64 {
    populate(data);
    let start = Instant::now();
    apply(data);
    // Use the full-resolution elapsed time. Truncating to whole milliseconds
    // distorts the result for millisecond-scale runs and yields a division by
    // zero when the pass finishes in under one millisecond.
    let duration = start.elapsed().as_secs_f64();
    verify(data, n);
    // Work estimate: 12 `expm1` calls per element at a nominal 15 operations
    // each, scaled to giga-operations.
    let gflops = (n as f64) * 12.0 * 15.0E-9;
    let rate = gflops / duration;
    println!("{:?}", rate);
    rate
}
/// Entry point: reports AVX2 availability and the CPU count, then runs the
/// benchmark ten times and prints the best throughput observed.
fn main() {
    let has_avx2 = is_x86_feature_detected!("avx2");
    println!("avx2: {:?}", has_avx2);

    let cpu: usize = num_cpus::get();
    println!("num cores {:?}", cpu);

    // take n as large as possible
    let n: usize = 4 * 256 * 1024;
    let mut input = vec![Fast(0.1); n];

    // Keep the highest throughput seen across the rounds, starting from 0.0.
    let best: f64 = (0..10).fold(0.0, |best_so_far, _round| {
        let gflop = run(&mut input, n);
        if gflop > best_so_far {
            gflop
        } else {
            best_so_far
        }
    });

    println!("Metric : {:?} GFlop/s", best);
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment