Skip to content

Instantly share code, notes, and snippets.

@hardbyte
Last active January 13, 2019 11:43
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save hardbyte/831dc10e1026873a53285f5049d75806 to your computer and use it in GitHub Desktop.
Save hardbyte/831dc10e1026873a53285f5049d75806 to your computer and use it in GitHub Desktop.
Making sure Julia can count bits.
#include <memory>
#include <functional>
#include <algorithm>
#include <vector>
#include <queue>
#include <stdint.h>
#include <cstdlib>
#include <ctime>
#include <cassert>
#include <climits>
static constexpr int WORD_BYTES = sizeof(uint64_t);
/**
* The popcount of n elements of buf is the sum of c0, c1, c2, c3.
*/
template<int n>
static inline void
popcount(
uint64_t &c0, uint64_t &c1, uint64_t &c2, uint64_t &c3,
const uint64_t *buf) {
popcount<4>(c0, c1, c2, c3, buf);
popcount<n - 4>(c0, c1, c2, c3, buf + 4);
}
// Fast Path
//
// Source: http://danluu.com/assembly-intrinsics/
// https://stackoverflow.com/questions/25078285/replacing-a-32-bit-loop-count-variable-with-64-bit-introduces-crazy-performance
//
// NB: Dan Luu's original assembly (and the SO answer it was based on)
// is incorrect because it clobbers registers marked as "input only"
// (see warning at
// https://gcc.gnu.org/onlinedocs/gcc/Extended-Asm.html#InputOperands
// -- this mistake does not materialise with GCC (4.9), but it does
// with Clang (3.6 and 3.8)). We fix the mistake by explicitly
// loading the contents of buf into registers and using these same
// registers for the intermediate popcnts.
/**
* The popcount of 4 elements of buf is the sum of c0, c1, c2, c3.
*/
template<>
inline void
popcount<4>(
uint64_t &c0, uint64_t &c1, uint64_t &c2, uint64_t &c3,
const uint64_t *buf) {
uint64_t b0, b1, b2, b3;
b0 = buf[0]; b1 = buf[1]; b2 = buf[2]; b3 = buf[3];
__asm__(
"popcnt %4, %4 \n\t"
"add %4, %0 \n\t"
"popcnt %5, %5 \n\t"
"add %5, %1 \n\t"
"popcnt %6, %6 \n\t"
"add %6, %2 \n\t"
"popcnt %7, %7 \n\t"
"add %7, %3 \n\t"
: "+r" (c0), "+r" (c1), "+r" (c2), "+r" (c3),
"+r" (b0), "+r" (b1), "+r" (b2), "+r" (b3));
}
// Slow paths
// TODO: Assumes sizeof(long) == 8
//
// NB: The specialisation to n=3 is not currently used but included
// for completeness (i.e. so that popcount<n> is defined for all
// non-negative n) and in anticipation of its use in the near future.
#if 0
template<>
inline void
popcount<3>(
uint64_t &c0, uint64_t &c1, uint64_t &c2, uint64_t &,
const uint64_t *buf) {
c0 += __builtin_popcountl(buf[0]);
c1 += __builtin_popcountl(buf[1]);
c2 += __builtin_popcountl(buf[2]);
}
#endif
/**
* The popcount of 2 elements of buf is the sum of c0, c1.
*/
template<>
inline void
popcount<2>(
uint64_t &c0, uint64_t &c1, uint64_t &, uint64_t &,
const uint64_t *buf) {
c0 += __builtin_popcountl(buf[0]);
c1 += __builtin_popcountl(buf[1]);
}
/**
* The popcount *buf is in c0.
*/
template<>
inline void
popcount<1>(
uint64_t &c0, uint64_t &, uint64_t &, uint64_t &,
const uint64_t *buf) {
c0 += __builtin_popcountl(buf[0]);
}
/**
* Calculate population counts of an array of inputs of nwords elements.
*
* 'arrays' must point to narrays*nwords*WORD_BYTES bytes
* 'counts' must point to narrays*sizeof(uint32_t) bytes.
* For i = 0 to narrays - 1, the population count of the nwords elements
*
* arrays[i * nwords] ... arrays[(i + 1) * nwords - 1]
*
* is put in counts[i].
*/
template<int nwords>
static void
_popcount_arrays(uint32_t *counts, const uint64_t *arrays, int narrays) {
uint64_t c0, c1, c2, c3;
for (int i = 0; i < narrays; ++i, arrays += nwords) {
c0 = c1 = c2 = c3 = 0;
popcount<nwords>(c0, c1, c2, c3, arrays);
counts[i] = c0 + c1 + c2 + c3;
}
}
/**
* Return the popcount of the nwords elements starting at array.
*/
static uint32_t
_popcount_array(const uint64_t *array, int nwords) {
uint64_t c0, c1, c2, c3;
c0 = c1 = c2 = c3 = 0;
while (nwords >= 16) {
popcount<16>(c0, c1, c2, c3, array);
array += 16;
nwords -= 16;
}
// nwords < 16
if (nwords >= 8) {
popcount<8>(c0, c1, c2, c3, array);
array += 8;
nwords -= 8;
}
// nwords < 8
if (nwords >= 4) {
popcount<4>(c0, c1, c2, c3, array);
array += 4;
nwords -= 4;
}
// nwords < 4
if (nwords >= 2) {
popcount<2>(c0, c1, c2, c3, array);
array += 2;
nwords -= 2;
}
// nwords < 2
if (nwords == 1)
popcount<1>(c0, c1, c2, c3, array);
return c0 + c1 + c2 + c3;
}
/**
* Convert clock measurement t to milliseconds.
*
* t should have been obtained as the difference of calls to clock()
* for this to make sense.
*/
static inline double
to_millis(clock_t t) {
static constexpr double CPS = (double)CLOCKS_PER_SEC;
return t * 1.0E3 / CPS;
}
static inline uint32_t
abs_diff(uint32_t a, uint32_t b) {
if (a > b)
return a - b;
return b - a;
}
typedef const uint64_t word_tp;
typedef std::function<void (word_tp *)> deleter_fn;
typedef std::unique_ptr<word_tp, deleter_fn> word_ptr;
/**
* Given a char pointer ptr pointing to nbytes bytes, return a
* std::unique_ptr to uint64_t containing those same byte values
* (using the host's byte order, i.e. no byte reordering is done).
*
* The main purpose of this function is to fix pointer alignment
* issues when converting from a char pointer to a uint64 pointer; the
* issue being that a uint64 pointer has stricter alignment
* requirements than a char pointer.
*
* We assume that releasing the memory associated with ptr is someone
* else's problem. So, if ptr is already correctly aligned, we just
* cast it to uint64 and return a unique_ptr whose deleter is
* empty. If ptr is misaligned, then we copy the nbytes at ptr to a
* location that is correctly aligned and then return a unique_ptr
* whose deleter will delete that allocated space when the unique_ptr
* goes out of scope.
*
* NB: ASSUMES nbytes is divisible by WORD_BYTES.
*
* TODO: On some architectures it might be advantageous to have
* 16-byte aligned memory, even when the words used are only 8 bytes
* (this is to do with cache line size). This function could be easily
* modified to allow experimentation with different alignments.
*/
static word_ptr
adjust_ptr_alignment(const char *ptr, size_t nbytes) {
static constexpr size_t wordbytes = sizeof(word_tp);
size_t nwords = nbytes / wordbytes;
uintptr_t addr = reinterpret_cast<uintptr_t>(ptr);
// ptr is correctly aligned if addr is divisible by wordbytes
if (addr % wordbytes != 0) {
// ptr is NOT correctly aligned
// copy_words has correct alignment
uint64_t *copy_words = new uint64_t[nwords];
// alias copy_words with a byte pointer; the cast safe because
// uint8_t alignment is less restrictive than uint64_t
uint8_t *copy_bytes = reinterpret_cast<uint8_t *>(copy_words);
// copy everything across
std::copy(ptr, ptr + nbytes, copy_bytes);
// return a unique_ptr with array delete to delete copy_words
auto array_delete = [] (word_tp *p) { delete[] p; };
return word_ptr(copy_words, array_delete);
} else {
// ptr IS correctly aligned
// safe cast because the address of ptr is wordbyte-aligned.
word_tp *same = reinterpret_cast<word_tp *>(ptr);
// don't delete backing array since we don't own it
auto empty_delete = [] (word_tp *) { };
return word_ptr(same, empty_delete);
}
}
extern "C"
{
/**
* Calculate population counts of an array of inputs; return how
* long it took in milliseconds.
*
* 'arrays' must point to narrays*array_bytes bytes
* 'counts' must point to narrays*sizeof(uint32_t) bytes.
* For i = 0 to n - 1, the population count of the array_bytes*8 bits
*
* arrays[i * array_bytes] ... arrays[(i + 1) * array_bytes - 1]
*
* is put in counts[i].
*
* ASSUMES: array_bytes is divisible by 8.
*/
double
popcount_arrays(
uint32_t *counts,
const char *arrays, int narrays, int array_bytes) {
// assumes WORD_BYTES divides array_bytes
int nwords = array_bytes / WORD_BYTES;
// The static_cast is to avoid int overflow in the multiplication
size_t total_bytes = static_cast<size_t>(narrays) * array_bytes;
auto ptr = adjust_ptr_alignment(arrays, total_bytes);
auto u = ptr.get();
clock_t t = clock();
switch (nwords) {
case 32: _popcount_arrays<32>(counts, u, narrays); break;
case 16: _popcount_arrays<16>(counts, u, narrays); break;
case 8: _popcount_arrays< 8>(counts, u, narrays); break;
default:
for (int i = 0; i < narrays; ++i, u += nwords)
counts[i] = _popcount_array(u, nwords);
}
return to_millis(clock() - t);
}
}
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"So the idea is to popcount vectors of 128 byte data as fast as possible. Want to try see if I can do it faster in C++ or Julia..."
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"128"
]
},
"execution_count": 1,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Some random data to popcount - should expect mean popcount per 128 bytes to be 512.\n",
"using Formatting: printfmt\n",
"narrays = 10\n",
"array_bytes = 128"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"10×128 Array{Int64,2}:\n",
" 2 6 4 4 4 1 3 4 4 7 7 3 4 … 5 2 3 3 4 1 3 4 5 3 4 2\n",
" 5 4 5 1 5 5 4 3 5 3 4 2 5 6 3 5 5 5 6 6 3 5 2 6 3\n",
" 1 5 1 5 4 5 4 4 5 4 4 3 6 1 4 3 5 2 5 1 6 4 5 6 6\n",
" 7 3 6 2 4 5 3 6 5 5 3 4 7 3 3 4 3 3 5 3 2 4 6 3 3\n",
" 5 6 2 2 4 5 4 2 5 3 5 4 2 2 3 3 4 3 6 2 6 4 6 5 4\n",
" 3 3 6 6 4 6 2 3 3 6 3 3 5 … 3 4 4 4 7 5 3 4 6 3 3 2\n",
" 4 3 3 4 8 1 6 6 5 3 4 4 5 5 3 5 7 5 5 4 3 3 4 5 4\n",
" 7 6 7 2 4 4 6 6 4 4 3 4 2 3 5 3 4 5 5 1 3 6 7 3 4\n",
" 4 5 4 2 3 4 3 5 2 4 1 5 5 3 4 3 2 3 5 2 6 4 7 6 5\n",
" 4 4 4 3 4 3 5 5 3 7 6 1 4 2 5 4 3 5 3 2 2 6 2 5 4"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Let's assume our data is in a 2D UInt8 array\n",
"arrays = rand(UInt8, (narrays, array_bytes));\n",
"\n",
"# Perhaps we can count ones of the rows\n",
"count_ones.(arrays)"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"10×1 Array{Int64,2}:\n",
" 509\n",
" 519\n",
" 523\n",
" 492\n",
" 516\n",
" 493\n",
" 528\n",
" 524\n",
" 507\n",
" 519"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# That is running popcount on each UInt8 which probably isn't going to be great\n",
"# for performance, but let's see how it plays out.\n",
"\n",
"# We will make a function out of broadcasting `count_ones` for easier benchmarking\n",
"popcount_2d_byte_arrays(arrays::Array{UInt8, 2}) = sum(count_ones.(arrays), dims=2)\n",
"\n",
"popcount_2d_byte_arrays(arrays)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Well that seems to work, let's time it."
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"benchmark (generic function with 1 method)"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"function benchmark(arrays, popcountf, fname, num_bytes)\n",
" num_runs = 100\n",
" # Call it once to ensure function is compiled\n",
" popcountf(arrays)\n",
" total_time = @elapsed for run in 1:num_runs\n",
" counts = popcountf(arrays)\n",
" end\n",
" avg_run_s = total_time/num_runs\n",
"\n",
" println(\"Avg run speed from julia: \", avg_run_s)\n",
"\n",
" bandwidth = num_bytes / (avg_run_s * 2^20)\n",
" printfmt(\"Julia $(fname) popcount measured bandwidth of {:.1f} MiB/s\", bandwidth)\n",
" return avg_run_s\n",
"end"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Avg run speed from julia: 0.04621711034\n",
"Julia 2d byte array popcount measured bandwidth of 264.1 MiB/s"
]
}
],
"source": [
"narrays = 100000\n",
"array_bytes = 128\n",
"arrays_bytes = rand(UInt8, (narrays, array_bytes))\n",
"\n",
"benchmark(arrays_bytes, popcount_2d_byte_arrays, \"2d byte array\", narrays * array_bytes);"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"So we have a starting point - `~270 MiB/s`. Now we saw that we were essentially calling `count_ones` on individual bytes, what would happen if our elements were a bit bigger - say machine word sized? Let's use a `UInt64` instead of a `UInt8`."
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Avg run speed from julia: 0.00291228427\n",
"Julia 2d UInt64 array popcount measured bandwidth of 4191.6 MiB/s"
]
}
],
"source": [
"# Same function definition as before - just different input type.\n",
"popcount_2d_uint64_arrays(arrays::Array{UInt64, 2}) = sum(count_ones.(arrays), dims=2)\n",
"\n",
"# input array now has 8 times less elements.\n",
"arrays_64 = rand(UInt64, (narrays, Int(array_bytes/8)));\n",
"\n",
"benchmark(arrays_64, popcount_2d_uint64_arrays, \"2d UInt64 array\", narrays * array_bytes);"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Well! Now we are talking in `GiB/s`. A good improvement over our first stab too. This is probably because the compiler can take advantage of a popcount instruction that works with a width of 64 bits. We can actually see this by calling `code_native`:"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\t.text\n",
"; Function count_ones {\n",
"; Location: int.jl:358\n",
"\tpopcntq\t%rdi, %rax\n",
"\tretq\n",
"\tnopw\t%cs:(%rax,%rax)\n",
";}\n"
]
}
],
"source": [
"# The count_ones wraps the native popcntq instruction.\n",
"code_native(count_ones, (UInt64,))"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\t.text\n",
"; Function count_ones {\n",
"; Location: int.jl:358\n",
"\tmovzbl\t%dil, %eax\n",
"\tpopcntl\t%eax, %eax\n",
"\tretq\n",
"\tnopl\t(%rax)\n",
";}\n"
]
}
],
"source": [
"# The count_ones wraps the native popcnt instructions\n",
"code_native(count_ones, (UInt8,))"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\t.text\n",
"; Function count_ones {\n",
"; Location: int.jl:358\n",
"\tpopcntq\t%rsi, %rcx\n",
"\tpopcntq\t%rdi, %rax\n",
"\taddq\t%rcx, %rax\n",
"\tretq\n",
"\tnop\n",
";}\n"
]
}
],
"source": [
"# The count_ones wraps the native popcnt instructions\n",
"code_native(count_ones, (UInt128,))"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Avg run speed from julia: 0.00171447827\n",
"Julia 2d UInt128 array popcount measured bandwidth of 7120.0 MiB/s"
]
}
],
"source": [
"# Well the UInt128 instructions are just calling the popcntq twice and adding the result (in assembly)\n",
"# We should see a speed up by using 128bit integers.\n",
"# Same function definition as before - just different input type.\n",
"popcount_2d_uint128_arrays(arrays::Array{UInt128, 2}) = sum(count_ones.(arrays), dims=2)\n",
"\n",
"# input array now has 16 times less elements.\n",
"arrays_128 = rand(UInt128, (narrays, Int(array_bytes/16)));\n",
"\n",
"benchmark(arrays_128, popcount_2d_uint128_arrays, \"2d UInt128 array\", narrays * array_bytes);"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Avg run speed from julia: 0.01657183077\n",
"Julia ntuple of uint64s popcount measured bandwidth of 7366.1 MiB/s"
]
}
],
"source": [
"# Note 128 Bytes is 1024 bits. Which is 16 x 64bits\n",
"# Lets assume that working on 4 x 64bits at a time is best able to be vectorized.\n",
"fourwords = NTuple{4, UInt64}\n",
"\n",
"function fword_popcount(arrays)\n",
" counts = zeros(Int32, size(arrays)[1])\n",
" for i in eachindex(arrays)\n",
" pcount = 0\n",
" for j in 1:4\n",
" pcount += sum(count_ones.(arrays[i][j]))\n",
" end\n",
" counts[i] = pcount\n",
" end\n",
" return counts\n",
"end\n",
"\n",
"make_fword() = [fourwords(rand(UInt64, 4)) for i in 1:4]\n",
"narrays = 1000000\n",
"fdata = [make_fword() for i in 1:narrays]\n",
"\n",
"benchmark(fdata, fword_popcount, \"ntuple of uint64s\", narrays * array_bytes);"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Some increased preformance for our efforts.\n",
"\n",
"Now a more natural datatype for the user might be an array of `BitArray` instances."
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"((1000000,), (1024,))"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"bitarrays = [BitArray(rand(Bool, array_bytes * 8)) for i in 1:narrays];\n",
"size(bitarrays), size(bitarrays[1])"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"534"
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Pretty cool that this works\n",
"count(bitarrays[1])"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Avg run speed from julia: 0.01412127291\n",
"Julia broadcast count on BitArray popcount measured bandwidth of 8644.4 MiB/s"
]
}
],
"source": [
"# Just broadcasting the builtin `count` on the bitarray\n",
"popcount_broadcast_count(bitarrays) = count.(bitarrays)\n",
"\n",
"benchmark(bitarrays, popcount_broadcast_count, \"broadcast count on BitArray\", narrays * array_bytes);"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"`~9 GiB/s` using the builtin `count`. Well any half way sensible person would be happy with that."
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"16-element Array{UInt64,1}:\n",
" 0xff35c936649e4b09\n",
" 0x99e0ef02ff3d8dbe\n",
" 0x8e47077312897feb\n",
" 0x4f5e80ccbb130238\n",
" 0xc5e0cffe2acd7174\n",
" 0x5abff90455268611\n",
" 0x2f6645cd50f4d9c0\n",
" 0x57eaaf7e6658b4e0\n",
" 0x2abb4fe469c1dc8a\n",
" 0xffa2383b0676a71e\n",
" 0xbea07f0615cdd6c9\n",
" 0x16fe9c48ede9b493\n",
" 0xa671a5c4e61d3387\n",
" 0xe0fd7542fae48ba1\n",
" 0xc077d70393b29792\n",
" 0x69bdc92b265316d2"
]
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Observe that BitArray instances have a `.chunks` attribute\n",
"# with raw UInt64 elements that we can use to efficiently popcount:\n",
"bitarrays[1].chunks"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"16-element Array{Int64,1}:\n",
" 34\n",
" 38\n",
" 34\n",
" 28\n",
" 36\n",
" 30\n",
" 31\n",
" 36\n",
" 33\n",
" 35\n",
" 34\n",
" 35\n",
" 32\n",
" 34\n",
" 32\n",
" 32"
]
},
"execution_count": 16,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# So we can broadcast the `count_ones` function across these chunks\n",
"count_ones.(bitarrays[1].chunks)"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"534"
]
},
"execution_count": 17,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# And sum them up\n",
"sum(count_ones.(bitarrays[1].chunks))"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Avg run speed from julia: 0.06992416397000001\n",
"Julia BitArray count_ones of chunks popcount measured bandwidth of 1745.8 MiB/s"
]
}
],
"source": [
"popcount_ba_chunks(chunks::Array{UInt64}) = sum(count_ones.(chunks))\n",
" \n",
"function popcount_sum_count_ones(bitarrays)\n",
" l = size(bitarrays)[1]\n",
" counts = zeros(Int64, l)\n",
" for i in 1:l\n",
" counts[i] = popcount_ba_chunks(bitarrays[i].chunks)\n",
" end\n",
" return counts\n",
"end\n",
"\n",
"benchmark(bitarrays, popcount_sum_count_ones, \"BitArray count_ones of chunks\", narrays * array_bytes);"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Ok so clearly Julia knows a better way to count ones than that. Let's try something different and wrap some C++ code and call that from Julia... The original code can be found on [github](https://github.com/n1analytics/anonlink/blob/master/_cffi_build/dice_one_against_many.cpp#L49).\n",
"\n",
"I'm trying to make the benchmarking fair in that we will still preallocate the output, noting that particular speed up would help all of the methods."
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"popcount (generic function with 2 methods)"
]
},
"execution_count": 19,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"popcount(arrays::Array{UInt8}, array_bytes::Int64, narrays::Int64) = popcount(zeros(Int32, narrays), arrays, array_bytes, narrays)\n",
"function popcount(counts::Array{Int32}, arrays::Array{UInt8}, array_bytes::Int64, narrays::Int64)\n",
" elapsed_ms = ccall(\n",
" (:popcount_arrays, \"_countingbits.so\"), Cdouble,\n",
" (Ptr{Int32},Ptr{UInt8},Cint,Cint),\n",
" counts, arrays, narrays, array_bytes\n",
" )\n",
" return counts\n",
"end"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Avg run speed from julia: 0.00864360435\n",
"Julia C++ from anonlink popcount measured bandwidth of 14122.6 MiB/s"
]
}
],
"source": [
"arrays = rand(UInt8, narrays * array_bytes)\n",
"f(arrays) = popcount(arrays, array_bytes, narrays)\n",
"\n",
"benchmark(arrays, f, \"C++ from anonlink\", narrays * array_bytes);"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"You have no idea how worried I was that it wouldn't be faster! Still it isn't faster by much. Can you beat it?"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"10-element Array{Tuple{Int64,Int64},1}:\n",
" (504, 504)\n",
" (497, 497)\n",
" (537, 537)\n",
" (496, 496)\n",
" (492, 492)\n",
" (520, 520)\n",
" (505, 505)\n",
" (504, 504)\n",
" (492, 492)\n",
" (507, 507)"
]
},
"execution_count": 21,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# To check it is correct we convert the memory layout and compare:\n",
"reshaped_2d = reshape(arrays, (narrays, array_bytes))\n",
"\n",
"function convert_memory(x::Array{T, 2}) where T\n",
" ninds = axes(x, 1)\n",
" binds = axes(x, 2)\n",
" \n",
" out = similar(Array{T}, binds, ninds)\n",
" for col = ninds, row = binds\n",
" out[row, col] = x[col, row]\n",
" end\n",
" return out\n",
"end\n",
"\n",
"a = [Int64(i) for i in popcount(convert_memory(reshaped_2d), array_bytes, narrays)]\n",
"b = popcount_2d_byte_arrays(reshaped_2d)\n",
"\n",
"collect(zip(a, b))[1:10]"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"true"
]
},
"execution_count": 22,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"count(a .== b) == length(a)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Julia 1.0.3",
"language": "julia",
"name": "julia-1.0"
},
"language_info": {
"file_extension": ".jl",
"mimetype": "application/julia",
"name": "julia",
"version": "1.0.3"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment