Created
November 29, 2016 16:42
-
-
Save mayah/fac2f92bb963c09166c19b2ad8b3dcbf to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#![feature(test)]
#![feature(cfg_target_feature)]

extern crate x86intrin;
extern crate test;

use test::black_box as bb;
use test::Bencher as B;
use x86intrin::*;
/// Scalar reference implementation of the Mandelbrot escape-time iteration.
///
/// Starts from `(x, y) = (c_x, c_y)` and repeatedly applies
/// `x <- x*x - y*y + c_x`, `y <- 2*x*y + c_y`.
///
/// Returns the number of iterations performed before `x*x + y*y` exceeded
/// `4.0`, capped at `max_iter`. A squared magnitude of exactly `4.0` does
/// not count as escaped.
fn naive(c_x: f32, c_y: f32, max_iter: u32) -> u32 {
    let mut x = c_x;
    let mut y = c_y;
    let mut count = 0;
    while count < max_iter {
        // All three products use the values from the start of this step.
        let xy = x * y;
        let xx = x * x;
        let yy = y * y;
        let sum = xx + yy;
        if sum > 4.0 {
            break;
        }
        count += 1;
        x = xx - yy + c_x;
        y = xy * 2.0 + c_y;
    }
    count
}
fn simd4(c_x: m128, c_y: m128, max_iter: u32) -> m128i { | |
let mut x = c_x; | |
let mut y = c_y; | |
let mut count = mm_setzero_si128(); | |
for _ in 0..max_iter as usize { | |
let xx = mm_mul_ps(x, x); | |
let yy = mm_mul_ps(y, y); | |
let sum = mm_add_ps(xx, yy); | |
let mask = mm_cmplt_ps(sum, mm_set1_ps(4.0)); | |
if mm_testz_ps(mask, mask) == 1 { | |
break; | |
} | |
count = mm_sub_epi32(count, mask.as_m128i()); | |
let xy = mm_mul_ps(x, y); | |
x = mm_add_ps(mm_sub_ps(xx, yy), c_x); | |
y = mm_add_ps(mm_add_ps(xy, xy), c_y); | |
} | |
count | |
} | |
fn simd8(c_x: m256, c_y: m256, max_iter: u32) -> m256i { | |
let mut x = c_x; | |
let mut y = c_y; | |
let mut count = mm256_setzero_si256(); | |
for _ in 0..max_iter as usize { | |
let xx = mm256_mul_ps(x, x); | |
let yy = mm256_mul_ps(y, y); | |
let sum = mm256_add_ps(xx, yy); | |
let mask = mm256_cmp_ps(sum, mm256_set1_ps(4.0), CMP_LT_OS); | |
if mm256_testz_ps(mask, mask) == 1 { | |
break; | |
} | |
count = mm256_sub_epi32(count, mask.as_m256i()); | |
let xy = mm256_mul_ps(x, y); | |
x = mm256_add_ps(mm256_sub_ps(xx, yy), c_x); | |
y = mm256_add_ps(mm256_add_ps(xy, xy), c_y); | |
} | |
count | |
} | |
/// Number of sample points along each axis of the grid.
const N: u32 = 128;
/// Distance between neighbouring sample points: a span of 3.0 split into
/// `N` steps. Derived from `N` so the two cannot drift apart.
const SCALE: f32 = 3.0 / (N as f32);
/// Iteration cap passed to every kernel.
const MAX_ITER: u32 = 255;
#[bench] | |
fn mandel_naive(b: &mut B) { | |
b.iter(|| { | |
for j in 0..N { | |
let y = -1.5 + (j as f32) * SCALE; | |
for i in 0..N { | |
let x = -2.2 + (i as f32) * SCALE; | |
bb(naive(x, y, MAX_ITER)); | |
} | |
} | |
}) | |
} | |
#[bench] | |
fn mandel_simd4(b: &mut B) { | |
let tweak = mm_setr_epi32(0, 1, 2, 3); | |
b.iter(|| { | |
for j in 0..N { | |
let y = mm_add_ps(mm_set1_ps(-1.5), mm_mul_ps(mm_set1_ps(SCALE), mm_set1_ps(j as f32))); | |
for i in 0..(N / 4) { | |
let i = mm_add_epi32(mm_set1_epi32((i * 4) as i32), tweak); | |
let x = mm_add_ps(mm_set1_ps(-2.2), mm_mul_ps(mm_set1_ps(SCALE), mm_cvtepi32_ps(i))); | |
bb(simd4(x, y, MAX_ITER)); | |
} | |
} | |
}) | |
} | |
#[bench] | |
fn mandel_simd8(b: &mut B) { | |
let tweak = mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7); | |
b.iter(|| { | |
for j in 0..N { | |
let y = mm256_add_ps(mm256_set1_ps(-1.5), mm256_mul_ps(mm256_set1_ps(SCALE), mm256_set1_ps(j as f32))); | |
for i in 0..(N / 8) { | |
let i = mm256_add_epi32(mm256_set1_epi32((i * 8) as i32), tweak); | |
let x = mm256_add_ps(mm256_set1_ps(-2.2), mm256_mul_ps(mm256_set1_ps(SCALE), mm256_cvtepi32_ps(i))); | |
bb(simd8(x, y, MAX_ITER)); | |
} | |
} | |
}) | |
} | |
#[bench] | |
fn test_validity(b: &mut B) { | |
let mut vs1 = Vec::new(); | |
{ | |
for j in 0..N { | |
let y = -1.5 + (j as f32) * SCALE; | |
for i in 0..N { | |
let x = -2.2 + (i as f32) * SCALE; | |
vs1.push(naive(x, y, MAX_ITER)); | |
} | |
} | |
} | |
let mut vs2 = Vec::new(); | |
{ | |
let tweak = mm_setr_epi32(0, 1, 2, 3); | |
for j in 0..N { | |
let y = mm_add_ps(mm_set1_ps(-1.5), mm_mul_ps(mm_set1_ps(SCALE), mm_set1_ps(j as f32))); | |
for i in 0..(N / 4) { | |
let i = mm_add_epi32(mm_set1_epi32((i * 4) as i32), tweak); | |
let x = mm_add_ps(mm_set1_ps(-2.2), mm_mul_ps(mm_set1_ps(SCALE), mm_cvtepi32_ps(i))); | |
let t = simd4(x, y, MAX_ITER).as_u32x4().as_array(); | |
vs2.push(t[0]); | |
vs2.push(t[1]); | |
vs2.push(t[2]); | |
vs2.push(t[3]); | |
} | |
} | |
} | |
let mut vs3 = Vec::new(); | |
{ | |
let tweak = mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7); | |
for j in 0..N { | |
let y = mm256_add_ps(mm256_set1_ps(-1.5), mm256_mul_ps(mm256_set1_ps(SCALE), mm256_set1_ps(j as f32))); | |
for i in 0..(N / 8) { | |
let i = mm256_add_epi32(mm256_set1_epi32((i * 8) as i32), tweak); | |
let x = mm256_add_ps(mm256_set1_ps(-2.2), mm256_mul_ps(mm256_set1_ps(SCALE), mm256_cvtepi32_ps(i))); | |
let t = simd8(x, y, MAX_ITER).as_u32x8().as_array(); | |
for v in &t { | |
vs3.push(*v); | |
} | |
} | |
} | |
} | |
// If value is not the same, this should be triggered. | |
assert_eq!(vs1, vs2); | |
assert_eq!(vs1, vs3); | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment