Skip to content

Instantly share code, notes, and snippets.

@nico-abram
Last active December 23, 2021 19:39
Show Gist options
  • Save nico-abram/279aebfe5727fc1791b6f70b5a0a2038 to your computer and use it in GitHub Desktop.
Save nico-abram/279aebfe5727fc1791b6f70b5a0a2038 to your computer and use it in GitHub Desktop.
/// Scalar reference implementation of the distance between `subimage` and a
/// same-sized window of `search_img`.
///
/// Both buffers are row-major with 3 interleaved `f32` values per pixel.
/// `search_w`, `w`, `h`, `x_offset` and `y_offset` are all measured in pixels;
/// the window's top-left pixel is `(x_offset, y_offset)` in `search_img`.
///
/// Returns `sqrt(sum of squared per-channel differences) / w / h`.
/// When `w` or `h` is zero nothing is accumulated and the result is NaN
/// (`0.0 / 0.0`).
///
/// # Panics
///
/// Panics if `subimage` holds fewer than `w * h * 3` values or if the window
/// does not lie entirely inside `search_img`.
fn image_dist_naivef32(
    x_offset: usize,
    y_offset: usize,
    search_img: &[f32],
    search_w: usize,
    subimage: &[f32],
    w: usize,
    h: usize,
) -> f32 {
    const CHANNELS: usize = 3;

    let row_len = w * CHANNELS;
    let search_stride = search_w * CHANNELS;

    // The offset is a pixel position, so it has to be scaled by the channel
    // count just like every other index into the interleaved buffer.
    let window = &search_img[(x_offset + y_offset * search_w) * CHANNELS..];
    let subimage = &subimage[..row_len * h];

    let mut sum_sq = 0.0f32;
    for y in 0..h {
        let sub_row = &subimage[y * row_len..(y + 1) * row_len];
        // Only `row_len` values of each search row are needed; requiring full
        // rows would reject valid windows that touch the end of the image.
        let search_row = &window[y * search_stride..y * search_stride + row_len];
        // Accumulate element by element into one running total so the
        // floating-point summation order stays strictly sequential.
        for (a, b) in sub_row.iter().zip(search_row) {
            let diff = a - b;
            sum_sq += diff * diff;
        }
    }
    sum_sq.sqrt() / w as f32 / h as f32
}
use simdeez::avx2::*;
use simdeez::scalar::*;
use simdeez::sse2::*;
use simdeez::sse41::*;
use simdeez::*;
// SIMD distance between `subimage` and a same-sized window of `search_img`.
//
// Unlike the 3-channel scalar version, every size and offset here is measured
// in `f32` elements: `subimage` is `w * h` floats, `search_img` has a row
// stride of `search_w` floats, and the window starts at element
// `x_offset + y_offset * search_w`.
//
// Returns `sqrt(sum of squared differences) / w / h`.
//
// Panics if `w` is not a multiple of the SIMD lane count, if `subimage` holds
// fewer than `w * h` floats, or if the window does not fit in `search_img`.
//
// `threshold` is accepted but never read by this implementation.
//
// Plain `//` comments are used (not `///`) because doc attributes placed
// inside the macro invocation may not be accepted by its pattern.
simd_runtime_generate!(
    fn image_dist_simdeez(
        x_offset: usize,
        y_offset: usize,
        search_img: &[f32],
        search_w: usize,
        subimage: &[f32],
        w: usize,
        h: usize,
        threshold: f32,
    ) -> f32 {
        assert!(w % S::VF32_WIDTH == 0);
        // Kept in the signature for callers; not used in the computation.
        let _ = threshold;

        // Furthest element (exclusive) touched in the window: the last row
        // starts at `(h - 1) * search_w` and `w` floats are read from it.
        // Guard `h == 0` so `h - 1` cannot underflow.
        let window_len = if h == 0 { 0 } else { (h - 1) * search_w + w };

        // These checked slices are the bounds checks that justify the
        // unchecked loads below; they panic on undersized inputs.
        let subimage = &subimage[..w * h];
        let search_img = &search_img[x_offset + y_offset * search_w..][..window_len];

        let mut res_simd = S::setzero_ps();
        let simd_iters_per_row = w / S::VF32_WIDTH;
        for y in 0..h {
            let mut sub_at = y * w;
            let mut search_at = y * search_w;
            for _ in 0..simd_iters_per_row {
                // SAFETY: within row `y` at most `w` floats are consumed, so
                // each load of `VF32_WIDTH` floats ends at or before
                // `y * w + w <= w * h == subimage.len()` and
                // `y * search_w + w <= (h - 1) * search_w + w == search_img.len()`.
                let search = S::loadu_ps(search_img.get_unchecked(search_at));
                let sub = S::loadu_ps(subimage.get_unchecked(sub_at));
                let diff = S::sub_ps(sub, search);
                let square = S::mul_ps(diff, diff);
                res_simd = S::add_ps(res_simd, square);
                sub_at += S::VF32_WIDTH;
                search_at += S::VF32_WIDTH;
            }
        }
        // Collapse the per-lane partial sums into one scalar.
        let res = S::horizontal_add_ps(res_simd);
        res.sqrt() / w as f32 / h as f32
    }
);
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment