Skip to content

Instantly share code, notes, and snippets.

@techgeek1
Created June 22, 2019 21:35
Show Gist options
  • Save techgeek1/24768d387d892ef11d4c3115e45228d5 to your computer and use it in GitHub Desktop.
Save techgeek1/24768d387d892ef11d4c3115e45228d5 to your computer and use it in GitHub Desktop.
/// AVX implementation of the Harley-Seal algorithm for counting the number of bits in a bitset
///
/// # Safety
/// Assumes that the input is `BITSET_SIZE_IN_WORDS` in length
pub unsafe fn cardinality(bitset: &[u64]) -> usize {
debug_assert!(bitset.len() == BITSET_SIZE_IN_WORDS);
let d = bitset.as_ptr() as *const Register;
let mut total = setzero_si();
let mut ones = setzero_si();
let mut twos = setzero_si();
let mut fours = setzero_si();
let mut eights = setzero_si();
let mut sixteens = setzero_si();
let mut twos_a = setzero_si();
let mut twos_b = setzero_si();
let mut fours_a = setzero_si();
let mut fours_b = setzero_si();
let mut eights_a = setzero_si();
let mut eights_b = setzero_si();
let mut i: isize = 0;
while i < bitset.len() as isize {
csa(ones , *d.offset(i) , *d.offset(i + 1) , &mut twos_a , &mut ones );
csa(ones , *d.offset(i + 2) , *d.offset(i + 3) , &mut twos_b , &mut ones );
csa(twos , twos_a , twos_b , &mut fours_a , &mut twos );
csa(ones , *d.offset(i + 4) , *d.offset(i + 5) , &mut twos_a , &mut ones );
csa(ones , *d.offset(i + 6) , *d.offset(i + 7) , &mut twos_b , &mut ones );
csa(twos , twos_a , twos_b , &mut fours_b , &mut twos );
csa(fours , fours_a , fours_b , &mut eights_a, &mut fours );
csa(ones , *d.offset(i + 8) , *d.offset(i + 9) , &mut twos_a , &mut ones );
csa(ones , *d.offset(i + 10), *d.offset(i + 11), &mut twos_b , &mut ones );
csa(twos , twos_a , twos_b , &mut fours_a , &mut twos );
csa(ones , *d.offset(i + 12), *d.offset(i + 13), &mut twos_a , &mut ones );
csa(ones , *d.offset(i + 14), *d.offset(i + 15), &mut twos_b , &mut ones );
csa(twos , twos_a , twos_b , &mut fours_b , &mut twos );
csa(fours , fours_a , fours_b , &mut eights_b, &mut fours );
csa(eights, eights_a , eights_b , &mut sixteens, &mut eights);
total = add_epi64(total, popcount256(sixteens));
i += 16;
}
total = slli_epi64(total, 4);
total = add_epi64(total, slli_epi64(popcount256(eights), 3));
total = add_epi64(total, slli_epi64(popcount256(fours) , 2));
total = add_epi64(total, slli_epi64(popcount256(twos) , 1));
total = add_epi64(total, popcount256(ones));
let mut result = extract_epi64(total, 0);
result += extract_epi64(total, 1);
result += extract_epi64(total, 2);
result += extract_epi64(total, 3);
result as usize
}
cfg_avx! {
const POPCOUNT_LOOKUP: [u8; 32] = [
0, 1, 1, 2, 1, 2, 2, 3,
1, 2, 2, 3, 2, 3, 3, 4,
0, 1, 1, 2, 1, 2, 2, 3,
1, 2, 2, 3, 2, 3, 3, 4
];
}
cfg_sse! {
const POPCOUNT_LOOKUP: [u8; 16] = [
0, 1, 1, 2, 1, 2, 2, 3,
1, 2, 2, 3, 2, 3, 3, 4
];
}
cfg_default! {
const POPCOUNT_LOOKUP: [u8; 0] = [];
}
/// Count the number of set bits in a 256 bit vector
unsafe fn popcount256(v: Register) -> Register {
let lookup = lddqu_si(POPCOUNT_LOOKUP.as_ptr() as *const Register);
let low_mask = set1_epi8(0x0);
let lo = and_si(v, low_mask);
let hi = and_si(srli_epi32(v, 4), low_mask);
let popcnt1 = shuffle_epi8(lookup, lo);
let popcnt2 = shuffle_epi8(lookup, hi);
let total = add_epi8(popcnt1, popcnt2);
sad_epu8(total, setzero_si())
}
/// AVX carry save adder
#[inline]
unsafe fn csa(a: Register, b: Register, c: Register, h: &mut Register, l: &mut Register) {
let u = xor_si(a, b);
*h = or_si(
and_si(a, b),
and_si(u, c)
);
*l = xor_si(u, c);
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment