raphlinus/simd sine generation tests

## simd sine generation tests
#![feature(test)]

extern crate test;

// Note: this dependency is nightly only
extern crate packed_simd;

#[macro_use]
extern crate cfg_if;

#[cfg(target_arch = "x86_64")]
use std::arch::x86_64::*;

use std::str::FromStr;

use packed_simd::{f32x4, f32x8, IntoBits};

trait SimdRounding {
    fn round(self) -> Self;
}

cfg_if! {
    if #[cfg(all(target_arch = "x86_64", target_feature = "sse4"))] {
        impl SimdRounding for f32x4 {
            #[inline]
            fn round(self) -> Self {
                unsafe {
                    _mm_round_ps(self.into_bits(), 8).into_bits()
                }
            }
        }
    } else {
        impl SimdRounding for f32x4 {
            fn round(self) -> Self {
                // TODO: provide fallback
                unimplemented!();
            }
        }
    }
}

cfg_if! {
    if #[cfg(all(target_arch = "x86_64", target_feature = "avx"))] {
        impl SimdRounding for f32x8 {
            #[inline(always)]
            fn round(self) -> Self {
                unsafe {
                    _mm256_round_ps(self.into_bits(), 8).into_bits()
                }
            }
        }
    } else {
        impl SimdRounding for f32x8 {
            #[inline(always)]
            fn round(self) -> Self {
                f32x8::new(self.extract(0).round(),
                    self.extract(1).round(),
                    self.extract(2).round(),
                    self.extract(3).round(),
                    self.extract(4).round(),
                    self.extract(5).round(),
                    self.extract(6).round(),
                    self.extract(7).round())
            }
        }
    }
}

trait Steps {
    fn steps() -> Self;
}

impl Steps for f32x4 {
    fn steps() -> Self {
        f32x4::new(0.0, 1.0, 2.0, 3.0)
    }
}

impl Steps for f32x8 {
    fn steps() -> Self {
        f32x8::new(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0)
    }
}

fn quadwave_scalar(freq: f32, obuf: &mut [f32]) {
    let mut phase = 0.0f32;
    for out in obuf {
        let y = phase - phase.round();
        let y = 16.0 * y * (0.5 - y.abs());
        *out = y;
        phase += freq;
    }
}

fn sin_scalar(freq: f32, obuf: &mut [f32]) {
    let mut phase = 0.0f32;
    let twopi = 2.0 * ::std::f32::consts::PI;
    for out in obuf {
        *out = (phase * twopi).sin();
        phase += freq;
        phase -= phase.floor();
    }
}

#[target_feature(enable = "avx")]
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
unsafe fn _mm256_abs(x: __m256) -> __m256 {
    _mm256_andnot_ps(_mm256_set1_ps(-0.0), x)
}

#[target_feature(enable = "avx")]
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
unsafe fn quadwave_avx(freq: f32, obuf: &mut [f32]) {
    assert!(obuf.len() % 8 == 0);
    let mut i = 0;
    let mut phase = _mm256_mul_ps(_mm256_setr_ps(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0),
        _mm256_set1_ps(freq));
    let phaseinc = _mm256_set1_ps(8.0 * freq);
    loop {
        if i == obuf.len() { break; }
        let y = _mm256_sub_ps(phase, _mm256_round_ps(phase, 8));
        let y = _mm256_mul_ps(y, _mm256_sub_ps(_mm256_set1_ps(0.5),
            _mm256_abs(y)));
        let y = _mm256_mul_ps(y, _mm256_set1_ps(16.0));
        _mm256_storeu_ps(obuf.as_mut_ptr().add(i), y);
        phase = _mm256_add_ps(phase, phaseinc);
        phase = _mm256_sub_ps(phase, _mm256_floor_ps(phase));
        i += 8;
    }
}

#[target_feature(enable = "avx")]
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
unsafe fn sin7_avx(freq: f32, obuf: &mut [f32]) {
    assert!(obuf.len() % 8 == 0);
    let mut i = 0;
    let mut phase = _mm256_mul_ps(_mm256_setr_ps(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0),
        _mm256_set1_ps(freq));
    phase = _mm256_add_ps(phase, _mm256_set1_ps(0.25));
    let phaseinc = _mm256_set1_ps(8.0 * freq);
    loop {
        if i == obuf.len() { break; }
        let a = _mm256_sub_ps(phase, _mm256_round_ps(phase, 8));
        let a = _mm256_sub_ps(_mm256_abs(a), _mm256_set1_ps(0.25));
        let a2 = _mm256_mul_ps(a, a);
        let y = _mm256_mul_ps(a2, _mm256_set1_ps(-57.09913892));
        let y = _mm256_mul_ps(a2, _mm256_add_ps(y, _mm256_set1_ps(78.3211512)));
        let y = _mm256_mul_ps(a2, _mm256_add_ps(y, _mm256_set1_ps(-41.13568647)));
        let y = _mm256_mul_ps(a, _mm256_add_ps(y, _mm256_set1_ps(6.27971764)));
        _mm256_storeu_ps(obuf.as_mut_ptr().add(i), y);
        phase = _mm256_add_ps(phase, phaseinc);
        phase = _mm256_sub_ps(phase, _mm256_floor_ps(phase));
        i += 8;
    }
}

#[target_feature(enable = "avx")]
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
unsafe fn sin9_avx(freq: f32, obuf: &mut [f32]) {
    assert!(obuf.len() % 8 == 0);
    let mut i = 0;
    let mut phase = _mm256_mul_ps(_mm256_setr_ps(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0),
        _mm256_set1_ps(freq));
    phase = _mm256_add_ps(phase, _mm256_set1_ps(0.25));
    let phaseinc = _mm256_set1_ps(8.0 * freq);
    while i != obuf.len() {
        if i == obuf.len() { break; }
        let a = _mm256_sub_ps(phase, _mm256_round_ps(phase, 8));
        let a = _mm256_sub_ps(_mm256_abs(a), _mm256_set1_ps(0.25));
        let a2 = _mm256_mul_ps(a, a);
        // coefs:
        // basis = linspace(-.5, .5, 1000)
        // print polyfit(basis, sin(2*pi*basis), 9)[-2::-2]
        let y = _mm256_mul_ps(a2, _mm256_set1_ps(33.15324345));
        let y = _mm256_mul_ps(a2, _mm256_add_ps(y, _mm256_set1_ps(-74.66884436)));
        let y = _mm256_mul_ps(a2, _mm256_add_ps(y, _mm256_set1_ps(81.39900205)));
        let y = _mm256_mul_ps(a2, _mm256_add_ps(y, _mm256_set1_ps(-41.33318707)));
        let y = _mm256_mul_ps(a, _mm256_add_ps(y, _mm256_set1_ps(6.28308759)));
        _mm256_storeu_ps(obuf.as_mut_ptr().add(i), y);
        phase = _mm256_add_ps(phase, phaseinc);
        phase = _mm256_sub_ps(phase, _mm256_floor_ps(phase));
        i += 8;
    }
}

#[target_feature(enable = "avx")]
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
unsafe fn packed_sin9(freq: f32, obuf: &mut [f32]) {
    let c0 =   6.28308759;
    let c1 = -41.33318707;
    let c2 =  81.39900205;
    let c3 = -74.66884436;
    let c4 =  33.15324345;

    assert!(obuf.len() % 8 == 0);
    let mut phase = f32x8::steps() * freq;
    phase += 0.25;
    let mut i = 0;
    while i != obuf.len() {
        let a = (phase - phase.round()).abs() - 0.25;
        let a2 = a * a;
        let y = a * (c0 + a2 * (c1 + a2 * (c2 + a2 * (c3 + a2 * c4))));
        y.write_to_slice_unaligned(&mut obuf[i..i + 8]);
        phase += freq * 8.0;
        i += 8;
    }
}

fn main() {
    let freq = f32::from_str(&::std::env::args().skip(1).next().unwrap()).unwrap();
    //let freq = 0.1;
    let mut obuf = [0.0f32; 32];
    unsafe { sin9_avx(freq, &mut obuf); }
    let mut obuf2 = [0.0f32; 32];
    unsafe { packed_sin9(freq, &mut obuf2); }
    for (x, y) in obuf.iter().zip(obuf2.iter()) {
        println!("{} {}", x, y);
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use test::Bencher;

    #[bench]
    fn bench_quadwave_scalar(b: &mut Bencher) {
        let mut obuf = [0.0f32; 64];
        let freq = 0.1;
        b.iter(|| quadwave_scalar(test::black_box(freq), &mut obuf));
    }

    #[bench]
    fn bench_quadwave_avx(b: &mut Bencher) {
        let mut obuf = [0.0f32; 64];
        let freq = 0.1;
        unsafe {
            b.iter(|| quadwave_avx(test::black_box(freq), &mut obuf));
        }
    }

    #[bench]
    fn bench_sin_scalar(b: &mut Bencher) {
        let mut obuf = [0.0f32; 64];
        let freq = 0.1;
        b.iter(|| sin_scalar(test::black_box(freq), &mut obuf));
    }

    #[bench]
    fn bench_sin7_avx(b: &mut Bencher) {
        let mut obuf = [0.0f32; 64];
        let freq = 0.1;
        unsafe {
            b.iter(|| sin7_avx(test::black_box(freq), &mut obuf));
        }
    }

    #[bench]
    fn bench_sin9_avx(b: &mut Bencher) {
        let mut obuf = [0.0f32; 64];
        let freq = 0.1;
        unsafe {
            b.iter(|| sin9_avx(test::black_box(freq), &mut obuf));
        }
    }

    #[bench]
    fn bench_packed_sin9(b: &mut Bencher) {
        let mut obuf = [0.0f32; 64];
        let freq = 0.1;
        unsafe {
            b.iter(|| packed_sin9(test::black_box(freq), &mut obuf));
        }
    }
}
	#![feature(test)]

	extern crate test;

	// Note: this dependency is nightly only
	extern crate packed_simd;

	#[macro_use]
	extern crate cfg_if;

	#[cfg(target_arch = "x86_64")]
	use std::arch::x86_64::*;

	use std::str::FromStr;

	use packed_simd::{f32x4, f32x8, IntoBits};

	trait SimdRounding {
	fn round(self) -> Self;
	}

	cfg_if! {
	if #[cfg(all(target_arch = "x86_64", target_feature = "sse4"))] {
	impl SimdRounding for f32x4 {
	#[inline]
	fn round(self) -> Self {
	unsafe {
	_mm_round_ps(self.into_bits(), 8).into_bits()
	}
	}
	}
	} else {
	impl SimdRounding for f32x4 {
	fn round(self) -> Self {
	// TODO: provide fallback
	unimplemented!();
	}
	}
	}
	}

	cfg_if! {
	if #[cfg(all(target_arch = "x86_64", target_feature = "avx"))] {
	impl SimdRounding for f32x8 {
	#[inline(always)]
	fn round(self) -> Self {
	unsafe {
	_mm256_round_ps(self.into_bits(), 8).into_bits()
	}
	}
	}
	} else {
	impl SimdRounding for f32x8 {
	#[inline(always)]
	fn round(self) -> Self {
	f32x8::new(self.extract(0).round(),
	self.extract(1).round(),
	self.extract(2).round(),
	self.extract(3).round(),
	self.extract(4).round(),
	self.extract(5).round(),
	self.extract(6).round(),
	self.extract(7).round())
	}
	}
	}
	}

	trait Steps {
	fn steps() -> Self;
	}

	impl Steps for f32x4 {
	fn steps() -> Self {
	f32x4::new(0.0, 1.0, 2.0, 3.0)
	}
	}

	impl Steps for f32x8 {
	fn steps() -> Self {
	f32x8::new(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0)
	}
	}

	fn quadwave_scalar(freq: f32, obuf: &mut [f32]) {
	let mut phase = 0.0f32;
	for out in obuf {
	let y = phase - phase.round();
	let y = 16.0 * y * (0.5 - y.abs());
	*out = y;
	phase += freq;
	}
	}

	fn sin_scalar(freq: f32, obuf: &mut [f32]) {
	let mut phase = 0.0f32;
	let twopi = 2.0 * ::std::f32::consts::PI;
	for out in obuf {
	out = (phase twopi).sin();
	phase += freq;
	phase -= phase.floor();
	}
	}

	#[target_feature(enable = "avx")]
	#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
	unsafe fn _mm256_abs(x: __m256) -> __m256 {
	_mm256_andnot_ps(_mm256_set1_ps(-0.0), x)
	}

	#[target_feature(enable = "avx")]
	#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
	unsafe fn quadwave_avx(freq: f32, obuf: &mut [f32]) {
	assert!(obuf.len() % 8 == 0);
	let mut i = 0;
	let mut phase = _mm256_mul_ps(_mm256_setr_ps(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0),
	_mm256_set1_ps(freq));
	let phaseinc = _mm256_set1_ps(8.0 * freq);
	loop {
	if i == obuf.len() { break; }
	let y = _mm256_sub_ps(phase, _mm256_round_ps(phase, 8));
	let y = _mm256_mul_ps(y, _mm256_sub_ps(_mm256_set1_ps(0.5),
	_mm256_abs(y)));
	let y = _mm256_mul_ps(y, _mm256_set1_ps(16.0));
	_mm256_storeu_ps(obuf.as_mut_ptr().add(i), y);
	phase = _mm256_add_ps(phase, phaseinc);
	phase = _mm256_sub_ps(phase, _mm256_floor_ps(phase));
	i += 8;
	}
	}

	#[target_feature(enable = "avx")]
	#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
	unsafe fn sin7_avx(freq: f32, obuf: &mut [f32]) {
	assert!(obuf.len() % 8 == 0);
	let mut i = 0;
	let mut phase = _mm256_mul_ps(_mm256_setr_ps(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0),
	_mm256_set1_ps(freq));
	phase = _mm256_add_ps(phase, _mm256_set1_ps(0.25));
	let phaseinc = _mm256_set1_ps(8.0 * freq);
	loop {
	if i == obuf.len() { break; }
	let a = _mm256_sub_ps(phase, _mm256_round_ps(phase, 8));
	let a = _mm256_sub_ps(_mm256_abs(a), _mm256_set1_ps(0.25));
	let a2 = _mm256_mul_ps(a, a);
	let y = _mm256_mul_ps(a2, _mm256_set1_ps(-57.09913892));
	let y = _mm256_mul_ps(a2, _mm256_add_ps(y, _mm256_set1_ps(78.3211512)));
	let y = _mm256_mul_ps(a2, _mm256_add_ps(y, _mm256_set1_ps(-41.13568647)));
	let y = _mm256_mul_ps(a, _mm256_add_ps(y, _mm256_set1_ps(6.27971764)));
	_mm256_storeu_ps(obuf.as_mut_ptr().add(i), y);
	phase = _mm256_add_ps(phase, phaseinc);
	phase = _mm256_sub_ps(phase, _mm256_floor_ps(phase));
	i += 8;
	}
	}

	#[target_feature(enable = "avx")]
	#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
	unsafe fn sin9_avx(freq: f32, obuf: &mut [f32]) {
	assert!(obuf.len() % 8 == 0);
	let mut i = 0;
	let mut phase = _mm256_mul_ps(_mm256_setr_ps(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0),
	_mm256_set1_ps(freq));
	phase = _mm256_add_ps(phase, _mm256_set1_ps(0.25));
	let phaseinc = _mm256_set1_ps(8.0 * freq);
	while i != obuf.len() {
	if i == obuf.len() { break; }
	let a = _mm256_sub_ps(phase, _mm256_round_ps(phase, 8));
	let a = _mm256_sub_ps(_mm256_abs(a), _mm256_set1_ps(0.25));
	let a2 = _mm256_mul_ps(a, a);
	// coefs:
	// basis = linspace(-.5, .5, 1000)
	// print polyfit(basis, sin(2pibasis), 9)[-2::-2]
	let y = _mm256_mul_ps(a2, _mm256_set1_ps(33.15324345));
	let y = _mm256_mul_ps(a2, _mm256_add_ps(y, _mm256_set1_ps(-74.66884436)));
	let y = _mm256_mul_ps(a2, _mm256_add_ps(y, _mm256_set1_ps(81.39900205)));
	let y = _mm256_mul_ps(a2, _mm256_add_ps(y, _mm256_set1_ps(-41.33318707)));
	let y = _mm256_mul_ps(a, _mm256_add_ps(y, _mm256_set1_ps(6.28308759)));
	_mm256_storeu_ps(obuf.as_mut_ptr().add(i), y);
	phase = _mm256_add_ps(phase, phaseinc);
	phase = _mm256_sub_ps(phase, _mm256_floor_ps(phase));
	i += 8;
	}
	}

	#[target_feature(enable = "avx")]
	#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
	unsafe fn packed_sin9(freq: f32, obuf: &mut [f32]) {
	let c0 = 6.28308759;
	let c1 = -41.33318707;
	let c2 = 81.39900205;
	let c3 = -74.66884436;
	let c4 = 33.15324345;

	assert!(obuf.len() % 8 == 0);
	let mut phase = f32x8::steps() * freq;
	phase += 0.25;
	let mut i = 0;
	while i != obuf.len() {
	let a = (phase - phase.round()).abs() - 0.25;
	let a2 = a * a;
	let y = a * (c0 + a2 * (c1 + a2 * (c2 + a2 * (c3 + a2 * c4))));
	y.write_to_slice_unaligned(&mut obuf[i..i + 8]);
	phase += freq * 8.0;
	i += 8;
	}
	}

	fn main() {
	let freq = f32::from_str(&::std::env::args().skip(1).next().unwrap()).unwrap();
	//let freq = 0.1;
	let mut obuf = [0.0f32; 32];
	unsafe { sin9_avx(freq, &mut obuf); }
	let mut obuf2 = [0.0f32; 32];
	unsafe { packed_sin9(freq, &mut obuf2); }
	for (x, y) in obuf.iter().zip(obuf2.iter()) {
	println!("{} {}", x, y);
	}
	}

	#[cfg(test)]
	mod tests {
	use super::*;
	use test::Bencher;

	#[bench]
	fn bench_quadwave_scalar(b: &mut Bencher) {
	let mut obuf = [0.0f32; 64];
	let freq = 0.1;
	b.iter(\|\| quadwave_scalar(test::black_box(freq), &mut obuf));
	}

	#[bench]
	fn bench_quadwave_avx(b: &mut Bencher) {
	let mut obuf = [0.0f32; 64];
	let freq = 0.1;
	unsafe {
	b.iter(\|\| quadwave_avx(test::black_box(freq), &mut obuf));
	}
	}

	#[bench]
	fn bench_sin_scalar(b: &mut Bencher) {
	let mut obuf = [0.0f32; 64];
	let freq = 0.1;
	b.iter(\|\| sin_scalar(test::black_box(freq), &mut obuf));
	}

	#[bench]
	fn bench_sin7_avx(b: &mut Bencher) {
	let mut obuf = [0.0f32; 64];
	let freq = 0.1;
	unsafe {
	b.iter(\|\| sin7_avx(test::black_box(freq), &mut obuf));
	}
	}

	#[bench]
	fn bench_sin9_avx(b: &mut Bencher) {
	let mut obuf = [0.0f32; 64];
	let freq = 0.1;
	unsafe {
	b.iter(\|\| sin9_avx(test::black_box(freq), &mut obuf));
	}
	}

	#[bench]
	fn bench_packed_sin9(b: &mut Bencher) {
	let mut obuf = [0.0f32; 64];
	let freq = 0.1;
	unsafe {
	b.iter(\|\| packed_sin9(test::black_box(freq), &mut obuf));
	}
	}
	}