Last active
January 23, 2022 10:29
-
-
Save zommiommy/1e70a039a7ed8ea323995e1f88a6c500 to your computer and use it in GitHub Desktop.
Benching of AVX dish vs relu
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#![feature(portable_simd)] | |
#![feature(test)] | |
extern crate test; | |
use test::{black_box, Bencher}; | |
use std::convert::TryInto; | |
use std::simd::*; | |
use std::arch::x86_64::*; | |
type Reg = f32x8; | |
/// compute the activation function | |
#[inline(always)] | |
fn dish(x: &Reg) -> Reg { | |
let ones = Reg::splat(1.0); | |
let half = Reg::splat(0.5); | |
let fma = ones + x * x; | |
half * x * vec_sqrt(ones + (x / (fma))) | |
} | |
#[inline(always)] | |
fn vec_sqrt(x: Reg) -> Reg { | |
unsafe { | |
let res = _mm256_rsqrt_ps( | |
*((&x as *const Reg) as *const __m256) | |
); | |
*((&res as *const __m256) as *const Reg) | |
} | |
} | |
#[inline(always)] | |
fn relu(x: &Reg) -> Reg { | |
let zeros = Reg::splat(0.0); | |
x.max(zeros) | |
} | |
fn get_test_values() -> Vec<Reg> { | |
const MAX: usize = 1_000_000; | |
let res = (0..MAX).map(|x| x as f32 - (MAX / 2) as f32).collect::<Vec<f32>>(); | |
res.chunks(8).map(|x| { | |
let array: [f32; 8] = x.try_into().unwrap(); | |
array.into() | |
}).collect::<Vec<Reg>>() | |
} | |
#[bench] | |
fn bench_dish(b: &mut Bencher) { | |
let values = black_box(get_test_values()); | |
b.bytes = (values.len() * std::mem::size_of::<Reg>()) as _; | |
b.iter(|| { | |
values.iter() | |
.map(dish) | |
.collect::<Vec<Reg>>() | |
}); | |
} | |
#[bench] | |
fn bench_relu(b: &mut Bencher) { | |
let values = black_box(get_test_values()); | |
b.bytes = (values.len() * std::mem::size_of::<Reg>()) as _; | |
b.iter(|| { | |
values.iter() | |
.map(relu) | |
.collect::<Vec<Reg>>() | |
}); | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
On my Ryzen 3900X:
test bench_dish ... bench:   1,424,578 ns/iter (+/- 10,866) = 2807 MB/s
test bench_relu ... bench:      95,921 ns/iter (+/- 11,679) = 41700 MB/s
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment