Skip to content

Instantly share code, notes, and snippets.

@mooman219
Last active January 26, 2020 06:57
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save mooman219/141d5d73f9d389231f7c323ac96babfa to your computer and use it in GitHub Desktop.
Save mooman219/141d5d73f9d389231f7c323ac96babfa to your computer and use it in GitHub Desktop.
/// Converts the accumulation buffer `self.a` into a bitmap of `self.w * self.h` bytes.
///
/// Output byte `k` is the low 8 bits of `trunc(sum(self.a[0..=k]) * 255.0)`. The running
/// sum is computed four lanes at a time with an in-register prefix sum, and the last lane
/// of each group is carried into the next group.
///
/// Only the first `self.w * self.h` elements of `self.a` are read; a trailing group of
/// fewer than four elements is zero-padded in a local buffer, so `self.a` does not need
/// to be padded to a multiple of four.
///
/// # Panics
///
/// Panics if `self.a` holds fewer than `self.w * self.h` elements.
#[cfg(all(target_feature = "sse", any(target_arch = "x86", target_arch = "x86_64")))]
pub fn get_bitmap(&self) -> Vec<u8> {
    let length = self.w * self.h;
    let source: &[f32] = &self.a[..length];
    let mut output = Vec::with_capacity(length);
    // SAFETY: `lanes` always points at four readable, initialised f32s (either a full
    // chunk of `source` or the local `padded` array), and the transmute converts a
    // 16-byte integer vector into a 16-byte `[i32; 4]`, for which every bit pattern is
    // valid.
    // NOTE(review): `_mm_slli_si128` and `_mm_cvttps_epi32` are SSE2 intrinsics while the
    // cfg gate above only names `sse`; SSE2 is baseline on x86_64 — confirm for x86.
    unsafe {
        // Loop-invariant scale factor.
        let scale = _mm_set1_ps(255.0);
        // offset = Running sum carried over from the previous group, in all lanes.
        let mut offset = _mm_setzero_ps();
        for chunk in source.chunks(4) {
            // Only the final chunk can be short; pad it with zeros so the 4-wide load
            // never reads past the end of `source`. Lane k of the prefix sum depends
            // only on lanes 0..=k, so the padding cannot affect the valid lanes.
            let mut padded = [0.0f32; 4];
            let lanes: *const f32 = if chunk.len() == 4 {
                chunk.as_ptr()
            } else {
                padded[..chunk.len()].copy_from_slice(chunk);
                padded.as_ptr()
            };
            // x = (a0, a1, a2, a3)
            let mut x = _mm_loadu_ps(lanes);
            // x += x shifted left by one lane (padding with 0s). The casts only satisfy
            // the type requirements, they are otherwise nops.
            // x = (a0, a0 + a1, a1 + a2, a2 + a3)
            x = _mm_add_ps(x, _mm_castsi128_ps(_mm_slli_si128(_mm_castps_si128(x), 4)));
            // x += (0.0, 0.0, x[0], x[1]), completing the 4-lane prefix sum.
            x = _mm_add_ps(x, _mm_shuffle_ps(_mm_setzero_ps(), x, 0x40));
            // x += offset
            x = _mm_add_ps(x, offset);
            // y = (x * 255.0) converted to i32s with truncation.
            let y = _mm_cvttps_epi32(_mm_mul_ps(x, scale));
            // Keep the low byte of each lane. Going through a plain array avoids the
            // SSSE3-only byte shuffle and any write into uninitialised or shared memory.
            let ints: [i32; 4] = core::mem::transmute(y);
            let bytes = [ints[0] as u8, ints[1] as u8, ints[2] as u8, ints[3] as u8];
            output.extend_from_slice(&bytes[..chunk.len()]);
            // offset = (x[3], x[3], x[3], x[3])
            offset = _mm_shuffle_ps(x, x, 0b11_11_11_11);
        }
    }
    output
}
// Same function, written as a scalar loop without SIMD intrinsics
/// Builds a bitmap with one byte per element of `self.a`.
///
/// Each byte is the running sum of `self.a` up to and including that element,
/// multiplied by 255.0 and cast to `u8`.
pub fn get_bitmap(&self) -> Vec<u8> {
    let mut bitmap = Vec::with_capacity(self.a.len());
    let mut running = 0.0;
    for &delta in self.a.iter() {
        running += delta;
        bitmap.push((running * 255.0) as u8);
    }
    bitmap
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment