Create a gist now

Instantly share code, notes, and snippets.

What would you like to do?
#![feature(test)]
#![feature(cfg_target_feature)]
extern crate x86intrin;
extern crate test;
use test::black_box as bb;
use test::Bencher as B;
use x86intrin::*;
/// Scalar Mandelbrot escape-time iteration for the point `c = (c_x, c_y)`.
///
/// Starting from `z = c`, repeatedly applies `z <- z^2 + c` and returns how
/// many steps were taken before `|z|^2` exceeded `4.0`. If that never happens
/// within `max_iter` steps, `max_iter` is returned.
fn naive(c_x: f32, c_y: f32, max_iter: u32) -> u32 {
    let (mut re, mut im) = (c_x, c_y);
    for step in 0..max_iter {
        let cross = re * im;
        let re_sq = re * re;
        let im_sq = im * im;
        // Only a strictly greater squared magnitude counts as escaped; a NaN
        // sum compares false and therefore keeps iterating.
        if re_sq + im_sq > 4.0 {
            return step;
        }
        re = re_sq - im_sq + c_x;
        im = cross * 2.0 + c_y;
    }
    max_iter
}
fn simd4(c_x: m128, c_y: m128, max_iter: u32) -> m128i {
let mut x = c_x;
let mut y = c_y;
let mut count = mm_setzero_si128();
for _ in 0..max_iter as usize {
let xx = mm_mul_ps(x, x);
let yy = mm_mul_ps(y, y);
let sum = mm_add_ps(xx, yy);
let mask = mm_cmplt_ps(sum, mm_set1_ps(4.0));
if mm_testz_ps(mask, mask) == 1 {
break;
}
count = mm_sub_epi32(count, mask.as_m128i());
let xy = mm_mul_ps(x, y);
x = mm_add_ps(mm_sub_ps(xx, yy), c_x);
y = mm_add_ps(mm_add_ps(xy, xy), c_y);
}
count
}
fn simd8(c_x: m256, c_y: m256, max_iter: u32) -> m256i {
let mut x = c_x;
let mut y = c_y;
let mut count = mm256_setzero_si256();
for _ in 0..max_iter as usize {
let xx = mm256_mul_ps(x, x);
let yy = mm256_mul_ps(y, y);
let sum = mm256_add_ps(xx, yy);
let mask = mm256_cmp_ps(sum, mm256_set1_ps(4.0), CMP_LT_OS);
if mm256_testz_ps(mask, mask) == 1 {
break;
}
count = mm256_sub_epi32(count, mask.as_m256i());
let xy = mm256_mul_ps(x, y);
x = mm256_add_ps(mm256_sub_ps(xx, yy), c_x);
y = mm256_add_ps(mm256_add_ps(xy, xy), c_y);
}
count
}
/// Number of sample points along each axis of the benchmark grid.
const N: u32 = 128;
/// Distance between neighbouring sample points: `N` steps cover 3.0 units.
const SCALE: f32 = 3.0 / (N as f32);
/// Iteration budget handed to every escape-time routine.
const MAX_ITER: u32 = 255;
/// Benchmarks the scalar routine over the full `N` x `N` grid.
///
/// The grid starts at (-2.2, -1.5) and advances by `SCALE` per sample; each
/// result goes through `black_box` so the work cannot be optimized away.
#[bench]
fn mandel_naive(b: &mut B) {
    b.iter(|| {
        for im in (0..N).map(|row| -1.5 + (row as f32) * SCALE) {
            for re in (0..N).map(|col| -2.2 + (col as f32) * SCALE) {
                bb(naive(re, im, MAX_ITER));
            }
        }
    })
}
/// Benchmarks the four-lane routine over the full `N` x `N` grid.
///
/// Each inner step evaluates four horizontally adjacent samples; each result
/// goes through `black_box` so the work cannot be optimized away.
#[bench]
fn mandel_simd4(b: &mut B) {
    // Per-lane column offsets within one group of four samples.
    let lane_offsets = mm_setr_epi32(0, 1, 2, 3);
    b.iter(|| {
        let step = mm_set1_ps(SCALE);
        let x_origin = mm_set1_ps(-2.2);
        let y_origin = mm_set1_ps(-1.5);
        for row in 0..N {
            // The same y coordinate is broadcast to all lanes of a row.
            let y = mm_add_ps(y_origin, mm_mul_ps(step, mm_set1_ps(row as f32)));
            for group in 0..(N / 4) {
                let columns = mm_add_epi32(mm_set1_epi32((group * 4) as i32), lane_offsets);
                let x = mm_add_ps(x_origin, mm_mul_ps(step, mm_cvtepi32_ps(columns)));
                bb(simd4(x, y, MAX_ITER));
            }
        }
    })
}
/// Benchmarks the eight-lane routine over the full `N` x `N` grid.
///
/// Each inner step evaluates eight horizontally adjacent samples; each result
/// goes through `black_box` so the work cannot be optimized away.
#[bench]
fn mandel_simd8(b: &mut B) {
    // Per-lane column offsets within one group of eight samples.
    let lane_offsets = mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
    b.iter(|| {
        let step = mm256_set1_ps(SCALE);
        let x_origin = mm256_set1_ps(-2.2);
        let y_origin = mm256_set1_ps(-1.5);
        for row in 0..N {
            // The same y coordinate is broadcast to all lanes of a row.
            let y = mm256_add_ps(y_origin, mm256_mul_ps(step, mm256_set1_ps(row as f32)));
            for group in 0..(N / 8) {
                let columns = mm256_add_epi32(mm256_set1_epi32((group * 8) as i32), lane_offsets);
                let x = mm256_add_ps(x_origin, mm256_mul_ps(step, mm256_cvtepi32_ps(columns)));
                bb(simd8(x, y, MAX_ITER));
            }
        }
    })
}
/// Cross-checks the three implementations against each other.
///
/// Evaluates the same `N` x `N` grid with `naive`, `simd4` and `simd8`,
/// collecting the per-point counts in row-major order, and asserts that both
/// vectorized results equal the scalar one. It is registered with `#[bench]`
/// but never calls `b.iter`, so nothing is timed here.
#[bench]
fn test_validity(b: &mut B) {
    let total = (N * N) as usize;

    // Scalar reference results.
    let mut scalar_counts = Vec::with_capacity(total);
    for row in 0..N {
        let im = -1.5 + (row as f32) * SCALE;
        for col in 0..N {
            let re = -2.2 + (col as f32) * SCALE;
            scalar_counts.push(naive(re, im, MAX_ITER));
        }
    }

    // Four-lane results, unpacked lane by lane.
    let mut quad_counts = Vec::with_capacity(total);
    {
        let lane_offsets = mm_setr_epi32(0, 1, 2, 3);
        for row in 0..N {
            let y = mm_add_ps(mm_set1_ps(-1.5), mm_mul_ps(mm_set1_ps(SCALE), mm_set1_ps(row as f32)));
            for group in 0..(N / 4) {
                let columns = mm_add_epi32(mm_set1_epi32((group * 4) as i32), lane_offsets);
                let x = mm_add_ps(mm_set1_ps(-2.2), mm_mul_ps(mm_set1_ps(SCALE), mm_cvtepi32_ps(columns)));
                let lanes = simd4(x, y, MAX_ITER).as_u32x4().as_array();
                quad_counts.extend_from_slice(&[lanes[0], lanes[1], lanes[2], lanes[3]]);
            }
        }
    }

    // Eight-lane results, unpacked lane by lane.
    let mut oct_counts = Vec::with_capacity(total);
    {
        let lane_offsets = mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
        for row in 0..N {
            let y = mm256_add_ps(mm256_set1_ps(-1.5), mm256_mul_ps(mm256_set1_ps(SCALE), mm256_set1_ps(row as f32)));
            for group in 0..(N / 8) {
                let columns = mm256_add_epi32(mm256_set1_epi32((group * 8) as i32), lane_offsets);
                let x = mm256_add_ps(mm256_set1_ps(-2.2), mm256_mul_ps(mm256_set1_ps(SCALE), mm256_cvtepi32_ps(columns)));
                let lanes = simd8(x, y, MAX_ITER).as_u32x8().as_array();
                for &lane in &lanes {
                    oct_counts.push(lane);
                }
            }
        }
    }

    // Any disagreement between the implementations panics here.
    assert_eq!(scalar_counts, quad_counts);
    assert_eq!(scalar_counts, oct_counts);
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment