mayah/x86intrin-mandelbrot.rs

## x86intrin-mandelbrot.rs
#![feature(test)]
#![feature(cfg_target_feature)]

extern crate x86intrin;
extern crate test;

use test::black_box as bb;
use test::Bencher as B;
use x86intrin::*;

/// Scalar escape-time kernel for a single point `c = (c_x, c_y)`.
///
/// Starts from `z = c` and repeatedly applies `z = z^2 + c`. Returns how many
/// iterations were completed before `|z|^2` first exceeded 4.0, or `max_iter`
/// if that never happened within the iteration budget.
fn naive(c_x: f32, c_y: f32, max_iter: u32) -> u32 {
    let (mut re, mut im) = (c_x, c_y);
    for step in 0..max_iter {
        let re_sq = re * re;
        let im_sq = im * im;
        // Strictly greater than 4.0: a point sitting exactly on the bound keeps iterating.
        if re_sq + im_sq > 4.0 {
            return step;
        }
        let cross = re * im;
        re = re_sq - im_sq + c_x;
        im = cross * 2.0 + c_y;
    }
    max_iter
}

fn simd4(c_x: m128, c_y: m128, max_iter: u32) -> m128i {
    let mut x = c_x;
    let mut y = c_y;

    let mut count = mm_setzero_si128();
    for _ in 0..max_iter as usize {
        let xx = mm_mul_ps(x, x);
        let yy = mm_mul_ps(y, y);
        let sum = mm_add_ps(xx, yy);
        let mask = mm_cmplt_ps(sum, mm_set1_ps(4.0));

        if mm_testz_ps(mask, mask) == 1 {
            break;
        }

        count = mm_sub_epi32(count, mask.as_m128i());

        let xy = mm_mul_ps(x, y);
        x = mm_add_ps(mm_sub_ps(xx, yy), c_x);
        y = mm_add_ps(mm_add_ps(xy, xy), c_y);
    }

    count
}

fn simd8(c_x: m256, c_y: m256, max_iter: u32) -> m256i {
    let mut x = c_x;
    let mut y = c_y;

    let mut count = mm256_setzero_si256();
    for _ in 0..max_iter as usize {
        let xx = mm256_mul_ps(x, x);
        let yy = mm256_mul_ps(y, y);
        let sum = mm256_add_ps(xx, yy);

        let mask = mm256_cmp_ps(sum, mm256_set1_ps(4.0), CMP_LT_OS);

        if mm256_testz_ps(mask, mask) == 1 {
            break;
        }

        count = mm256_sub_epi32(count, mask.as_m256i());

        let xy = mm256_mul_ps(x, y);
        x = mm256_add_ps(mm256_sub_ps(xx, yy), c_x);
        y = mm256_add_ps(mm256_add_ps(xy, xy), c_y);
    }

    count
}

/// Spacing between neighbouring sample points; the grid indices are multiplied by this.
const SCALE: f32 = 3.0 / 128.0;
/// Number of sample points along each axis of the grid.
const N: u32 = 128;
/// Iteration cap passed to every kernel as `max_iter`.
const MAX_ITER: u32 = 255;

/// Benchmarks the scalar kernel over the full `N` x `N` sample grid.
#[bench]
fn mandel_naive(b: &mut B) {
    b.iter(|| {
        for row in 0..N {
            let c_im = -1.5 + (row as f32) * SCALE;
            for col in 0..N {
                let c_re = -2.2 + (col as f32) * SCALE;
                // black_box keeps the optimizer from discarding the result.
                let iterations = naive(c_re, c_im, MAX_ITER);
                bb(iterations);
            }
        }
    })
}

/// Benchmarks the four-lane kernel over the full grid, four columns per call.
#[bench]
fn mandel_simd4(b: &mut B) {
    // Per-lane column offsets within one group of four.
    let lane_offsets = mm_setr_epi32(0, 1, 2, 3);
    b.iter(|| {
        for row in 0..N {
            let row_step = mm_mul_ps(mm_set1_ps(SCALE), mm_set1_ps(row as f32));
            let c_im = mm_add_ps(mm_set1_ps(-1.5), row_step);
            for group in 0..(N / 4) {
                let first_col = mm_set1_epi32((group * 4) as i32);
                let cols = mm_add_epi32(first_col, lane_offsets);
                let col_step = mm_mul_ps(mm_set1_ps(SCALE), mm_cvtepi32_ps(cols));
                let c_re = mm_add_ps(mm_set1_ps(-2.2), col_step);
                // black_box keeps the optimizer from discarding the result.
                bb(simd4(c_re, c_im, MAX_ITER));
            }
        }
    })
}

/// Benchmarks the eight-lane kernel over the full grid, eight columns per call.
#[bench]
fn mandel_simd8(b: &mut B) {
    // Per-lane column offsets within one group of eight.
    let lane_offsets = mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
    b.iter(|| {
        for row in 0..N {
            let row_step = mm256_mul_ps(mm256_set1_ps(SCALE), mm256_set1_ps(row as f32));
            let c_im = mm256_add_ps(mm256_set1_ps(-1.5), row_step);
            for group in 0..(N / 8) {
                let first_col = mm256_set1_epi32((group * 8) as i32);
                let cols = mm256_add_epi32(first_col, lane_offsets);
                let col_step = mm256_mul_ps(mm256_set1_ps(SCALE), mm256_cvtepi32_ps(cols));
                let c_re = mm256_add_ps(mm256_set1_ps(-2.2), col_step);
                // black_box keeps the optimizer from discarding the result.
                bb(simd8(c_re, c_im, MAX_ITER));
            }
        }
    })
}

/// Cross-checks the three kernels: evaluates the whole grid with `naive`,
/// `simd4` and `simd8` and asserts that the iteration counts are identical.
///
/// Registered with `#[bench]` like the other functions in this file; the
/// `Bencher` argument is not used.
#[bench]
fn test_validity(b: &mut B) {
    let total = (N * N) as usize;

    // Scalar results, row by row.
    let mut scalar: Vec<u32> = Vec::with_capacity(total);
    for row in 0..N {
        let c_im = -1.5 + (row as f32) * SCALE;
        for col in 0..N {
            let c_re = -2.2 + (col as f32) * SCALE;
            scalar.push(naive(c_re, c_im, MAX_ITER));
        }
    }

    // Four-lane results, unpacked lane by lane into the same ordering.
    let mut sse: Vec<u32> = Vec::with_capacity(total);
    let offsets4 = mm_setr_epi32(0, 1, 2, 3);
    for row in 0..N {
        let row_step = mm_mul_ps(mm_set1_ps(SCALE), mm_set1_ps(row as f32));
        let c_im = mm_add_ps(mm_set1_ps(-1.5), row_step);
        for group in 0..(N / 4) {
            let cols = mm_add_epi32(mm_set1_epi32((group * 4) as i32), offsets4);
            let col_step = mm_mul_ps(mm_set1_ps(SCALE), mm_cvtepi32_ps(cols));
            let c_re = mm_add_ps(mm_set1_ps(-2.2), col_step);
            let lanes = simd4(c_re, c_im, MAX_ITER).as_u32x4().as_array();

            for k in 0..4 {
                sse.push(lanes[k]);
            }
        }
    }

    // Eight-lane results, unpacked the same way.
    let mut avx: Vec<u32> = Vec::with_capacity(total);
    let offsets8 = mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
    for row in 0..N {
        let row_step = mm256_mul_ps(mm256_set1_ps(SCALE), mm256_set1_ps(row as f32));
        let c_im = mm256_add_ps(mm256_set1_ps(-1.5), row_step);
        for group in 0..(N / 8) {
            let cols = mm256_add_epi32(mm256_set1_epi32((group * 8) as i32), offsets8);
            let col_step = mm256_mul_ps(mm256_set1_ps(SCALE), mm256_cvtepi32_ps(cols));
            let c_re = mm256_add_ps(mm256_set1_ps(-2.2), col_step);
            let lanes = simd8(c_re, c_im, MAX_ITER).as_u32x8().as_array();

            avx.extend(lanes.iter().cloned());
        }
    }

    // Any disagreement between the kernels makes one of these panic.
    assert_eq!(scalar, sse);
    assert_eq!(scalar, avx);
}
	#![feature(test)]
	#![feature(cfg_target_feature)]

	extern crate x86intrin;
	extern crate test;

	use test::black_box as bb;
	use test::Bencher as B;
	use x86intrin::*;

	/// Scalar escape-time kernel for a single point `c = (c_x, c_y)`.
	///
	/// Starts from `z = c` and repeatedly applies `z = z^2 + c`. Returns how many
	/// iterations were completed before `|z|^2` first exceeded 4.0, or `max_iter`
	/// if that never happened within the iteration budget.
	fn naive(c_x: f32, c_y: f32, max_iter: u32) -> u32 {
	    let (mut re, mut im) = (c_x, c_y);
	    for step in 0..max_iter {
	        let re_sq = re * re;
	        let im_sq = im * im;
	        // Strictly greater than 4.0: a point sitting exactly on the bound keeps iterating.
	        if re_sq + im_sq > 4.0 {
	            return step;
	        }
	        let cross = re * im;
	        re = re_sq - im_sq + c_x;
	        im = cross * 2.0 + c_y;
	    }
	    max_iter
	}

	fn simd4(c_x: m128, c_y: m128, max_iter: u32) -> m128i {
	let mut x = c_x;
	let mut y = c_y;

	let mut count = mm_setzero_si128();
	for _ in 0..max_iter as usize {
	let xx = mm_mul_ps(x, x);
	let yy = mm_mul_ps(y, y);
	let sum = mm_add_ps(xx, yy);
	let mask = mm_cmplt_ps(sum, mm_set1_ps(4.0));

	if mm_testz_ps(mask, mask) == 1 {
	break;
	}

	count = mm_sub_epi32(count, mask.as_m128i());

	let xy = mm_mul_ps(x, y);
	x = mm_add_ps(mm_sub_ps(xx, yy), c_x);
	y = mm_add_ps(mm_add_ps(xy, xy), c_y);
	}

	count
	}

	fn simd8(c_x: m256, c_y: m256, max_iter: u32) -> m256i {
	let mut x = c_x;
	let mut y = c_y;

	let mut count = mm256_setzero_si256();
	for _ in 0..max_iter as usize {
	let xx = mm256_mul_ps(x, x);
	let yy = mm256_mul_ps(y, y);
	let sum = mm256_add_ps(xx, yy);

	let mask = mm256_cmp_ps(sum, mm256_set1_ps(4.0), CMP_LT_OS);

	if mm256_testz_ps(mask, mask) == 1 {
	break;
	}

	count = mm256_sub_epi32(count, mask.as_m256i());

	let xy = mm256_mul_ps(x, y);
	x = mm256_add_ps(mm256_sub_ps(xx, yy), c_x);
	y = mm256_add_ps(mm256_add_ps(xy, xy), c_y);
	}

	count
	}

	/// Spacing between neighbouring sample points; the grid indices are multiplied by this.
	const SCALE: f32 = 3.0 / 128.0;
	/// Number of sample points along each axis of the grid.
	const N: u32 = 128;
	/// Iteration cap passed to every kernel as `max_iter`.
	const MAX_ITER: u32 = 255;

	#[bench]
	fn mandel_naive(b: &mut B) {
	b.iter(\|\| {
	for j in 0..N {
	let y = -1.5 + (j as f32) * SCALE;
	for i in 0..N {
	let x = -2.2 + (i as f32) * SCALE;
	bb(naive(x, y, MAX_ITER));
	}
	}
	})
	}

	#[bench]
	fn mandel_simd4(b: &mut B) {
	let tweak = mm_setr_epi32(0, 1, 2, 3);
	b.iter(\|\| {
	for j in 0..N {
	let y = mm_add_ps(mm_set1_ps(-1.5), mm_mul_ps(mm_set1_ps(SCALE), mm_set1_ps(j as f32)));
	for i in 0..(N / 4) {
	let i = mm_add_epi32(mm_set1_epi32((i * 4) as i32), tweak);
	let x = mm_add_ps(mm_set1_ps(-2.2), mm_mul_ps(mm_set1_ps(SCALE), mm_cvtepi32_ps(i)));
	bb(simd4(x, y, MAX_ITER));
	}
	}
	})
	}

	#[bench]
	fn mandel_simd8(b: &mut B) {
	let tweak = mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
	b.iter(\|\| {
	for j in 0..N {
	let y = mm256_add_ps(mm256_set1_ps(-1.5), mm256_mul_ps(mm256_set1_ps(SCALE), mm256_set1_ps(j as f32)));
	for i in 0..(N / 8) {
	let i = mm256_add_epi32(mm256_set1_epi32((i * 8) as i32), tweak);
	let x = mm256_add_ps(mm256_set1_ps(-2.2), mm256_mul_ps(mm256_set1_ps(SCALE), mm256_cvtepi32_ps(i)));
	bb(simd8(x, y, MAX_ITER));
	}
	}
	})
	}

	/// Cross-checks the three kernels: evaluates the whole grid with `naive`,
	/// `simd4` and `simd8` and asserts that the iteration counts are identical.
	///
	/// Registered with `#[bench]` like the other functions in this file; the
	/// `Bencher` argument is not used.
	#[bench]
	fn test_validity(b: &mut B) {
	    let total = (N * N) as usize;

	    // Scalar results, row by row.
	    let mut scalar: Vec<u32> = Vec::with_capacity(total);
	    for row in 0..N {
	        let c_im = -1.5 + (row as f32) * SCALE;
	        for col in 0..N {
	            let c_re = -2.2 + (col as f32) * SCALE;
	            scalar.push(naive(c_re, c_im, MAX_ITER));
	        }
	    }

	    // Four-lane results, unpacked lane by lane into the same ordering.
	    let mut sse: Vec<u32> = Vec::with_capacity(total);
	    let offsets4 = mm_setr_epi32(0, 1, 2, 3);
	    for row in 0..N {
	        let row_step = mm_mul_ps(mm_set1_ps(SCALE), mm_set1_ps(row as f32));
	        let c_im = mm_add_ps(mm_set1_ps(-1.5), row_step);
	        for group in 0..(N / 4) {
	            let cols = mm_add_epi32(mm_set1_epi32((group * 4) as i32), offsets4);
	            let col_step = mm_mul_ps(mm_set1_ps(SCALE), mm_cvtepi32_ps(cols));
	            let c_re = mm_add_ps(mm_set1_ps(-2.2), col_step);
	            let lanes = simd4(c_re, c_im, MAX_ITER).as_u32x4().as_array();

	            for k in 0..4 {
	                sse.push(lanes[k]);
	            }
	        }
	    }

	    // Eight-lane results, unpacked the same way.
	    let mut avx: Vec<u32> = Vec::with_capacity(total);
	    let offsets8 = mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
	    for row in 0..N {
	        let row_step = mm256_mul_ps(mm256_set1_ps(SCALE), mm256_set1_ps(row as f32));
	        let c_im = mm256_add_ps(mm256_set1_ps(-1.5), row_step);
	        for group in 0..(N / 8) {
	            let cols = mm256_add_epi32(mm256_set1_epi32((group * 8) as i32), offsets8);
	            let col_step = mm256_mul_ps(mm256_set1_ps(SCALE), mm256_cvtepi32_ps(cols));
	            let c_re = mm256_add_ps(mm256_set1_ps(-2.2), col_step);
	            let lanes = simd8(c_re, c_im, MAX_ITER).as_u32x8().as_array();

	            avx.extend(lanes.iter().cloned());
	        }
	    }

	    // Any disagreement between the kernels makes one of these panic.
	    assert_eq!(scalar, sse);
	    assert_eq!(scalar, avx);
	}