Skip to content

Instantly share code, notes, and snippets.

@hardbyte
Last active January 13, 2019 11:43
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save hardbyte/831dc10e1026873a53285f5049d75806 to your computer and use it in GitHub Desktop.
Save hardbyte/831dc10e1026873a53285f5049d75806 to your computer and use it in GitHub Desktop.
Making sure Julia can count bits.
#include <memory>
#include <functional>
#include <algorithm>
#include <vector>
#include <queue>
#include <stdint.h>
#include <cstdlib>
#include <ctime>
#include <cassert>
#include <climits>
static constexpr int WORD_BYTES = sizeof(uint64_t);
/**
* The popcount of n elements of buf is the sum of c0, c1, c2, c3.
*/
template<int n>
static inline void
popcount(
uint64_t &c0, uint64_t &c1, uint64_t &c2, uint64_t &c3,
const uint64_t *buf) {
popcount<4>(c0, c1, c2, c3, buf);
popcount<n - 4>(c0, c1, c2, c3, buf + 4);
}
// Fast Path
//
// Source: http://danluu.com/assembly-intrinsics/
// https://stackoverflow.com/questions/25078285/replacing-a-32-bit-loop-count-variable-with-64-bit-introduces-crazy-performance
//
// NB: Dan Luu's original assembly (and the SO answer it was based on)
// is incorrect because it clobbers registers marked as "input only"
// (see warning at
// https://gcc.gnu.org/onlinedocs/gcc/Extended-Asm.html#InputOperands
// -- this mistake does not materialise with GCC (4.9), but it does
// with Clang (3.6 and 3.8)). We fix the mistake by explicitly
// loading the contents of buf into registers and using these same
// registers for the intermediate popcnts.
/**
* The popcount of 4 elements of buf is the sum of c0, c1, c2, c3.
*/
template<>
inline void
popcount<4>(
uint64_t &c0, uint64_t &c1, uint64_t &c2, uint64_t &c3,
const uint64_t *buf) {
uint64_t b0, b1, b2, b3;
b0 = buf[0]; b1 = buf[1]; b2 = buf[2]; b3 = buf[3];
__asm__(
"popcnt %4, %4 \n\t"
"add %4, %0 \n\t"
"popcnt %5, %5 \n\t"
"add %5, %1 \n\t"
"popcnt %6, %6 \n\t"
"add %6, %2 \n\t"
"popcnt %7, %7 \n\t"
"add %7, %3 \n\t"
: "+r" (c0), "+r" (c1), "+r" (c2), "+r" (c3),
"+r" (b0), "+r" (b1), "+r" (b2), "+r" (b3));
}
// Slow paths
// TODO: Assumes sizeof(long) == 8
//
// NB: The specialisation to n=3 is not currently used but included
// for completeness (i.e. so that popcount<n> is defined for all
// non-negative n) and in anticipation of its use in the near future.
#if 0
template<>
inline void
popcount<3>(
uint64_t &c0, uint64_t &c1, uint64_t &c2, uint64_t &,
const uint64_t *buf) {
c0 += __builtin_popcountl(buf[0]);
c1 += __builtin_popcountl(buf[1]);
c2 += __builtin_popcountl(buf[2]);
}
#endif
/**
* The popcount of 2 elements of buf is the sum of c0, c1.
*/
template<>
inline void
popcount<2>(
uint64_t &c0, uint64_t &c1, uint64_t &, uint64_t &,
const uint64_t *buf) {
c0 += __builtin_popcountl(buf[0]);
c1 += __builtin_popcountl(buf[1]);
}
/**
* The popcount *buf is in c0.
*/
template<>
inline void
popcount<1>(
uint64_t &c0, uint64_t &, uint64_t &, uint64_t &,
const uint64_t *buf) {
c0 += __builtin_popcountl(buf[0]);
}
/**
* Calculate population counts of an array of inputs of nwords elements.
*
* 'arrays' must point to narrays*nwords*WORD_BYTES bytes
* 'counts' must point to narrays*sizeof(uint32_t) bytes.
* For i = 0 to narrays - 1, the population count of the nwords elements
*
* arrays[i * nwords] ... arrays[(i + 1) * nwords - 1]
*
* is put in counts[i].
*/
template<int nwords>
static void
_popcount_arrays(uint32_t *counts, const uint64_t *arrays, int narrays) {
uint64_t c0, c1, c2, c3;
for (int i = 0; i < narrays; ++i, arrays += nwords) {
c0 = c1 = c2 = c3 = 0;
popcount<nwords>(c0, c1, c2, c3, arrays);
counts[i] = c0 + c1 + c2 + c3;
}
}
/**
* Return the popcount of the nwords elements starting at array.
*/
static uint32_t
_popcount_array(const uint64_t *array, int nwords) {
uint64_t c0, c1, c2, c3;
c0 = c1 = c2 = c3 = 0;
while (nwords >= 16) {
popcount<16>(c0, c1, c2, c3, array);
array += 16;
nwords -= 16;
}
// nwords < 16
if (nwords >= 8) {
popcount<8>(c0, c1, c2, c3, array);
array += 8;
nwords -= 8;
}
// nwords < 8
if (nwords >= 4) {
popcount<4>(c0, c1, c2, c3, array);
array += 4;
nwords -= 4;
}
// nwords < 4
if (nwords >= 2) {
popcount<2>(c0, c1, c2, c3, array);
array += 2;
nwords -= 2;
}
// nwords < 2
if (nwords == 1)
popcount<1>(c0, c1, c2, c3, array);
return c0 + c1 + c2 + c3;
}
/**
* Convert clock measurement t to milliseconds.
*
* t should have been obtained as the difference of calls to clock()
* for this to make sense.
*/
static inline double
to_millis(clock_t t) {
static constexpr double CPS = (double)CLOCKS_PER_SEC;
return t * 1.0E3 / CPS;
}
static inline uint32_t
abs_diff(uint32_t a, uint32_t b) {
if (a > b)
return a - b;
return b - a;
}
typedef const uint64_t word_tp;
typedef std::function<void (word_tp *)> deleter_fn;
typedef std::unique_ptr<word_tp, deleter_fn> word_ptr;
/**
* Given a char pointer ptr pointing to nbytes bytes, return a
* std::unique_ptr to uint64_t containing those same byte values
* (using the host's byte order, i.e. no byte reordering is done).
*
* The main purpose of this function is to fix pointer alignment
* issues when converting from a char pointer to a uint64 pointer; the
* issue being that a uint64 pointer has stricter alignment
* requirements than a char pointer.
*
* We assume that releasing the memory associated with ptr is someone
* else's problem. So, if ptr is already correctly aligned, we just
* cast it to uint64 and return a unique_ptr whose deleter is
* empty. If ptr is misaligned, then we copy the nbytes at ptr to a
* location that is correctly aligned and then return a unique_ptr
* whose deleter will delete that allocated space when the unique_ptr
* goes out of scope.
*
* NB: ASSUMES nbytes is divisible by WORD_BYTES.
*
* TODO: On some architectures it might be advantageous to have
* 16-byte aligned memory, even when the words used are only 8 bytes
* (this is to do with cache line size). This function could be easily
* modified to allow experimentation with different alignments.
*/
static word_ptr
adjust_ptr_alignment(const char *ptr, size_t nbytes) {
static constexpr size_t wordbytes = sizeof(word_tp);
size_t nwords = nbytes / wordbytes;
uintptr_t addr = reinterpret_cast<uintptr_t>(ptr);
// ptr is correctly aligned if addr is divisible by wordbytes
if (addr % wordbytes != 0) {
// ptr is NOT correctly aligned
// copy_words has correct alignment
uint64_t *copy_words = new uint64_t[nwords];
// alias copy_words with a byte pointer; the cast safe because
// uint8_t alignment is less restrictive than uint64_t
uint8_t *copy_bytes = reinterpret_cast<uint8_t *>(copy_words);
// copy everything across
std::copy(ptr, ptr + nbytes, copy_bytes);
// return a unique_ptr with array delete to delete copy_words
auto array_delete = [] (word_tp *p) { delete[] p; };
return word_ptr(copy_words, array_delete);
} else {
// ptr IS correctly aligned
// safe cast because the address of ptr is wordbyte-aligned.
word_tp *same = reinterpret_cast<word_tp *>(ptr);
// don't delete backing array since we don't own it
auto empty_delete = [] (word_tp *) { };
return word_ptr(same, empty_delete);
}
}
extern "C"
{
/**
* Calculate population counts of an array of inputs; return how
* long it took in milliseconds.
*
* 'arrays' must point to narrays*array_bytes bytes
* 'counts' must point to narrays*sizeof(uint32_t) bytes.
* For i = 0 to n - 1, the population count of the array_bytes*8 bits
*
* arrays[i * array_bytes] ... arrays[(i + 1) * array_bytes - 1]
*
* is put in counts[i].
*
* ASSUMES: array_bytes is divisible by 8.
*/
double
popcount_arrays(
uint32_t *counts,
const char *arrays, int narrays, int array_bytes) {
// assumes WORD_BYTES divides array_bytes
int nwords = array_bytes / WORD_BYTES;
// The static_cast is to avoid int overflow in the multiplication
size_t total_bytes = static_cast<size_t>(narrays) * array_bytes;
auto ptr = adjust_ptr_alignment(arrays, total_bytes);
auto u = ptr.get();
clock_t t = clock();
switch (nwords) {
case 32: _popcount_arrays<32>(counts, u, narrays); break;
case 16: _popcount_arrays<16>(counts, u, narrays); break;
case 8: _popcount_arrays< 8>(counts, u, narrays); break;
default:
for (int i = 0; i < narrays; ++i, u += nwords)
counts[i] = _popcount_array(u, nwords);
}
return to_millis(clock() - t);
}
}
Display the source blob
Display the rendered blob
Raw
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment