-
-
Save s3bk/352be8b3af719b51830470a0860a3381 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
[package] | |
name = "fast" | |
version = "0.1.0" | |
authors = ["Sebastian K <s3bk@protonmail.com>"] | |
edition = "2018" | |
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html | |
[dependencies] | |
rayon = "*" | |
fast-floats = "*" | |
num_cpus = "*" |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#![feature(core_intrinsics)] | |
use std::time::{Instant}; | |
use rayon::prelude::*; // is provided for parallel iterators | |
use fast_floats::{Fast, FF64}; | |
// this function can't be changed (may only add attributes) | |
/// Polynomial approximation of `e^x - 1`.
///
/// Evaluates the Taylor series `x + x^2/2! + ... + x^15/15!` in Horner form.
/// Every coefficient is pre-multiplied by `15!` (`1307674368000`), and the
/// final factor `7.647...e-13` is `1/15!`, which undoes that scaling.
///
/// The benchmark's rules state that this body must not be changed; only
/// attributes may be added.
#[inline] | |
fn expm1(x: FF64) -> FF64 { | |
// Horner scheme: the innermost term is the x^15 coefficient (1), the outermost constant is 15!.
return ((((((((((((((15.0 + x) * x + 210.0) * x + 2730.0) * x + 32760.0) * x + 360360.0) * x + 3603600.0) * x + 32432400.0) * x + 259459200.0) * x + 1816214400.0) * x + 10897286400.0) * x + 54486432000.0) * x + 217945728000.0) * x + 653837184000.0) * x + 1307674368000.0) * x * 7.6471637318198164759011319857881e-13; | |
} | |
const M256: usize = !0xFF; | |
fn chunked_par(data: &mut [FF64], func: impl FnMut(&mut FF64) + Copy + Sync + Send) { | |
let data_ptr = data.as_ptr() as usize; | |
let head_ptr = (data_ptr + 255) & M256; | |
let tail_ptr = (data_ptr + data.len() * 8) & M256; | |
if tail_ptr <= head_ptr { | |
data.iter_mut().for_each(func); | |
return; | |
} | |
let head_n = (head_ptr - data_ptr) / 8; | |
let tail_n = (tail_ptr - data_ptr) / 8; | |
data[0 .. head_n].iter_mut().for_each(func); | |
let chunk_n = tail_n - head_n; | |
let chunk_tail_n = head_n + chunk_n - chunk_n % 1024; | |
debug_assert_eq!((chunk_tail_n - head_n) % 1024, 0); | |
data[head_n .. chunk_tail_n].par_chunks_exact_mut(1024).for_each(|s| { | |
debug_assert_eq!(s.len(), 1024); | |
debug_assert_eq!(s.as_ptr() as usize % 256, 0); | |
let chunk: &mut [FF64; 1024] = unsafe { | |
std::intrinsics::assume(s.as_ptr() as usize % 256 == 0); | |
&mut *(s.as_mut_ptr() as *mut [FF64; 1024]) | |
}; | |
chunk.iter_mut().for_each(func); | |
}); | |
data[chunk_tail_n .. tail_n].iter_mut().for_each(func); | |
data[tail_n ..].iter_mut().for_each(func); | |
} | |
// this function can't be changed (may only add attributes) | |
/// Applies `expm1` twelve times in succession, feeding each result into the
/// next call, and returns the final value.
///
/// The benchmark's rules state that this body must not be changed; only
/// attributes may be added.
fn twelve(x: FF64) -> FF64 { | |
return expm1( expm1( expm1( expm1( expm1( expm1( expm1( expm1( expm1( expm1( expm1( expm1(x)))))))))))); | |
} | |
// you might want to optimize this one as well | |
fn populate(data: &mut [FF64]) { | |
chunked_par(data, |d| *d = Fast(0.1)); | |
} | |
// optimize this one | |
fn apply(data: &mut [FF64]) { | |
chunked_par(data, |d| *d=twelve(*d)); | |
} | |
// you might want to optimize this one as well | |
fn verify(data: &[FF64], n: usize) { | |
for i in 0..n { | |
let expected = twelve(Fast(0.1)); | |
if expected != data[i] { | |
println!("error at {:?} - {:?} != {:?}", i, data[i], expected); | |
} | |
} | |
} | |
fn run(data: &mut [FF64], n: usize) -> f64 { | |
populate(data); | |
let start = Instant::now(); | |
apply(data); | |
let duration = (start.elapsed().as_millis() as f64) * 1.0E-3; | |
verify(data, n); | |
let gflops = (n as f64) * 12.0 * 15.0E-9; | |
println!("{:?}", gflops / duration); | |
return gflops / duration; | |
} | |
fn main() { | |
println!("avx2: {:?}", is_x86_feature_detected!("avx2")); | |
let cpu: usize = num_cpus::get(); | |
println!("num cores {:?}", cpu); | |
let n: usize = 4 * 256 * 1024; // take n as large as possible | |
let mut input = vec![Fast(0.1); n]; | |
let mut best : f64 = 0.0; | |
for _i in 0..10 { | |
let gflop = run(&mut input, n); | |
if gflop > best { | |
best = gflop; | |
} | |
} | |
println!("Metric : {:?} GFlop/s", best); | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment