Last active
January 13, 2019 11:43
-
-
Save hardbyte/831dc10e1026873a53285f5049d75806 to your computer and use it in GitHub Desktop.
Making sure Julia can count bits.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <memory> | |
#include <functional> | |
#include <algorithm> | |
#include <vector> | |
#include <queue> | |
#include <stdint.h> | |
#include <cstdlib> | |
#include <ctime> | |
#include <cassert> | |
#include <climits> | |
static constexpr int WORD_BYTES = sizeof(uint64_t); | |
/** | |
* The popcount of n elements of buf is the sum of c0, c1, c2, c3. | |
*/ | |
template<int n> | |
static inline void | |
popcount( | |
uint64_t &c0, uint64_t &c1, uint64_t &c2, uint64_t &c3, | |
const uint64_t *buf) { | |
popcount<4>(c0, c1, c2, c3, buf); | |
popcount<n - 4>(c0, c1, c2, c3, buf + 4); | |
} | |
// Fast Path | |
// | |
// Source: http://danluu.com/assembly-intrinsics/ | |
// https://stackoverflow.com/questions/25078285/replacing-a-32-bit-loop-count-variable-with-64-bit-introduces-crazy-performance | |
// | |
// NB: Dan Luu's original assembly (and the SO answer it was based on) | |
// is incorrect because it clobbers registers marked as "input only" | |
// (see warning at | |
// https://gcc.gnu.org/onlinedocs/gcc/Extended-Asm.html#InputOperands | |
// -- this mistake does not materialise with GCC (4.9), but it does | |
// with Clang (3.6 and 3.8)). We fix the mistake by explicitly | |
// loading the contents of buf into registers and using these same | |
// registers for the intermediate popcnts. | |
/** | |
* The popcount of 4 elements of buf is the sum of c0, c1, c2, c3. | |
*/ | |
template<> | |
inline void | |
popcount<4>( | |
uint64_t &c0, uint64_t &c1, uint64_t &c2, uint64_t &c3, | |
const uint64_t *buf) { | |
uint64_t b0, b1, b2, b3; | |
b0 = buf[0]; b1 = buf[1]; b2 = buf[2]; b3 = buf[3]; | |
__asm__( | |
"popcnt %4, %4 \n\t" | |
"add %4, %0 \n\t" | |
"popcnt %5, %5 \n\t" | |
"add %5, %1 \n\t" | |
"popcnt %6, %6 \n\t" | |
"add %6, %2 \n\t" | |
"popcnt %7, %7 \n\t" | |
"add %7, %3 \n\t" | |
: "+r" (c0), "+r" (c1), "+r" (c2), "+r" (c3), | |
"+r" (b0), "+r" (b1), "+r" (b2), "+r" (b3)); | |
} | |
// Slow paths | |
// TODO: Assumes sizeof(long) == 8 | |
// | |
// NB: The specialisation to n=3 is not currently used but included | |
// for completeness (i.e. so that popcount<n> is defined for all | |
// non-negative n) and in anticipation of its use in the near future. | |
#if 0 | |
template<> | |
inline void | |
popcount<3>( | |
uint64_t &c0, uint64_t &c1, uint64_t &c2, uint64_t &, | |
const uint64_t *buf) { | |
c0 += __builtin_popcountl(buf[0]); | |
c1 += __builtin_popcountl(buf[1]); | |
c2 += __builtin_popcountl(buf[2]); | |
} | |
#endif | |
/** | |
* The popcount of 2 elements of buf is the sum of c0, c1. | |
*/ | |
template<> | |
inline void | |
popcount<2>( | |
uint64_t &c0, uint64_t &c1, uint64_t &, uint64_t &, | |
const uint64_t *buf) { | |
c0 += __builtin_popcountl(buf[0]); | |
c1 += __builtin_popcountl(buf[1]); | |
} | |
/** | |
* The popcount *buf is in c0. | |
*/ | |
template<> | |
inline void | |
popcount<1>( | |
uint64_t &c0, uint64_t &, uint64_t &, uint64_t &, | |
const uint64_t *buf) { | |
c0 += __builtin_popcountl(buf[0]); | |
} | |
/** | |
* Calculate population counts of an array of inputs of nwords elements. | |
* | |
* 'arrays' must point to narrays*nwords*WORD_BYTES bytes | |
* 'counts' must point to narrays*sizeof(uint32_t) bytes. | |
* For i = 0 to narrays - 1, the population count of the nwords elements | |
* | |
* arrays[i * nwords] ... arrays[(i + 1) * nwords - 1] | |
* | |
* is put in counts[i]. | |
*/ | |
template<int nwords> | |
static void | |
_popcount_arrays(uint32_t *counts, const uint64_t *arrays, int narrays) { | |
uint64_t c0, c1, c2, c3; | |
for (int i = 0; i < narrays; ++i, arrays += nwords) { | |
c0 = c1 = c2 = c3 = 0; | |
popcount<nwords>(c0, c1, c2, c3, arrays); | |
counts[i] = c0 + c1 + c2 + c3; | |
} | |
} | |
/** | |
* Return the popcount of the nwords elements starting at array. | |
*/ | |
static uint32_t | |
_popcount_array(const uint64_t *array, int nwords) { | |
uint64_t c0, c1, c2, c3; | |
c0 = c1 = c2 = c3 = 0; | |
while (nwords >= 16) { | |
popcount<16>(c0, c1, c2, c3, array); | |
array += 16; | |
nwords -= 16; | |
} | |
// nwords < 16 | |
if (nwords >= 8) { | |
popcount<8>(c0, c1, c2, c3, array); | |
array += 8; | |
nwords -= 8; | |
} | |
// nwords < 8 | |
if (nwords >= 4) { | |
popcount<4>(c0, c1, c2, c3, array); | |
array += 4; | |
nwords -= 4; | |
} | |
// nwords < 4 | |
if (nwords >= 2) { | |
popcount<2>(c0, c1, c2, c3, array); | |
array += 2; | |
nwords -= 2; | |
} | |
// nwords < 2 | |
if (nwords == 1) | |
popcount<1>(c0, c1, c2, c3, array); | |
return c0 + c1 + c2 + c3; | |
} | |
/** | |
* Convert clock measurement t to milliseconds. | |
* | |
* t should have been obtained as the difference of calls to clock() | |
* for this to make sense. | |
*/ | |
static inline double | |
to_millis(clock_t t) { | |
static constexpr double CPS = (double)CLOCKS_PER_SEC; | |
return t * 1.0E3 / CPS; | |
} | |
static inline uint32_t | |
abs_diff(uint32_t a, uint32_t b) { | |
if (a > b) | |
return a - b; | |
return b - a; | |
} | |
typedef const uint64_t word_tp; | |
typedef std::function<void (word_tp *)> deleter_fn; | |
typedef std::unique_ptr<word_tp, deleter_fn> word_ptr; | |
/** | |
* Given a char pointer ptr pointing to nbytes bytes, return a | |
* std::unique_ptr to uint64_t containing those same byte values | |
* (using the host's byte order, i.e. no byte reordering is done). | |
* | |
* The main purpose of this function is to fix pointer alignment | |
* issues when converting from a char pointer to a uint64 pointer; the | |
* issue being that a uint64 pointer has stricter alignment | |
* requirements than a char pointer. | |
* | |
* We assume that releasing the memory associated with ptr is someone | |
* else's problem. So, if ptr is already correctly aligned, we just | |
* cast it to uint64 and return a unique_ptr whose deleter is | |
* empty. If ptr is misaligned, then we copy the nbytes at ptr to a | |
* location that is correctly aligned and then return a unique_ptr | |
* whose deleter will delete that allocated space when the unique_ptr | |
* goes out of scope. | |
* | |
* NB: ASSUMES nbytes is divisible by WORD_BYTES. | |
* | |
* TODO: On some architectures it might be advantageous to have | |
* 16-byte aligned memory, even when the words used are only 8 bytes | |
* (this is to do with cache line size). This function could be easily | |
* modified to allow experimentation with different alignments. | |
*/ | |
static word_ptr | |
adjust_ptr_alignment(const char *ptr, size_t nbytes) { | |
static constexpr size_t wordbytes = sizeof(word_tp); | |
size_t nwords = nbytes / wordbytes; | |
uintptr_t addr = reinterpret_cast<uintptr_t>(ptr); | |
// ptr is correctly aligned if addr is divisible by wordbytes | |
if (addr % wordbytes != 0) { | |
// ptr is NOT correctly aligned | |
// copy_words has correct alignment | |
uint64_t *copy_words = new uint64_t[nwords]; | |
// alias copy_words with a byte pointer; the cast safe because | |
// uint8_t alignment is less restrictive than uint64_t | |
uint8_t *copy_bytes = reinterpret_cast<uint8_t *>(copy_words); | |
// copy everything across | |
std::copy(ptr, ptr + nbytes, copy_bytes); | |
// return a unique_ptr with array delete to delete copy_words | |
auto array_delete = [] (word_tp *p) { delete[] p; }; | |
return word_ptr(copy_words, array_delete); | |
} else { | |
// ptr IS correctly aligned | |
// safe cast because the address of ptr is wordbyte-aligned. | |
word_tp *same = reinterpret_cast<word_tp *>(ptr); | |
// don't delete backing array since we don't own it | |
auto empty_delete = [] (word_tp *) { }; | |
return word_ptr(same, empty_delete); | |
} | |
} | |
extern "C" | |
{ | |
/** | |
* Calculate population counts of an array of inputs; return how | |
* long it took in milliseconds. | |
* | |
* 'arrays' must point to narrays*array_bytes bytes | |
* 'counts' must point to narrays*sizeof(uint32_t) bytes. | |
* For i = 0 to n - 1, the population count of the array_bytes*8 bits | |
* | |
* arrays[i * array_bytes] ... arrays[(i + 1) * array_bytes - 1] | |
* | |
* is put in counts[i]. | |
* | |
* ASSUMES: array_bytes is divisible by 8. | |
*/ | |
double | |
popcount_arrays( | |
uint32_t *counts, | |
const char *arrays, int narrays, int array_bytes) { | |
// assumes WORD_BYTES divides array_bytes | |
int nwords = array_bytes / WORD_BYTES; | |
// The static_cast is to avoid int overflow in the multiplication | |
size_t total_bytes = static_cast<size_t>(narrays) * array_bytes; | |
auto ptr = adjust_ptr_alignment(arrays, total_bytes); | |
auto u = ptr.get(); | |
clock_t t = clock(); | |
switch (nwords) { | |
case 32: _popcount_arrays<32>(counts, u, narrays); break; | |
case 16: _popcount_arrays<16>(counts, u, narrays); break; | |
case 8: _popcount_arrays< 8>(counts, u, narrays); break; | |
default: | |
for (int i = 0; i < narrays; ++i, u += nwords) | |
counts[i] = _popcount_array(u, nwords); | |
} | |
return to_millis(clock() - t); | |
} | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"So the idea is to popcount vectors of 128 byte data as fast as possible. Want to try see if I can do it faster in C++ or Julia..." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"128" | |
] | |
}, | |
"execution_count": 1, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# Some random data to popcount - should expect mean popcount per 128 bytes to be 512.\n", | |
"using Formatting: printfmt\n", | |
"narrays = 10\n", | |
"array_bytes = 128" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 2, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"10×128 Array{Int64,2}:\n", | |
" 2 6 4 4 4 1 3 4 4 7 7 3 4 … 5 2 3 3 4 1 3 4 5 3 4 2\n", | |
" 5 4 5 1 5 5 4 3 5 3 4 2 5 6 3 5 5 5 6 6 3 5 2 6 3\n", | |
" 1 5 1 5 4 5 4 4 5 4 4 3 6 1 4 3 5 2 5 1 6 4 5 6 6\n", | |
" 7 3 6 2 4 5 3 6 5 5 3 4 7 3 3 4 3 3 5 3 2 4 6 3 3\n", | |
" 5 6 2 2 4 5 4 2 5 3 5 4 2 2 3 3 4 3 6 2 6 4 6 5 4\n", | |
" 3 3 6 6 4 6 2 3 3 6 3 3 5 … 3 4 4 4 7 5 3 4 6 3 3 2\n", | |
" 4 3 3 4 8 1 6 6 5 3 4 4 5 5 3 5 7 5 5 4 3 3 4 5 4\n", | |
" 7 6 7 2 4 4 6 6 4 4 3 4 2 3 5 3 4 5 5 1 3 6 7 3 4\n", | |
" 4 5 4 2 3 4 3 5 2 4 1 5 5 3 4 3 2 3 5 2 6 4 7 6 5\n", | |
" 4 4 4 3 4 3 5 5 3 7 6 1 4 2 5 4 3 5 3 2 2 6 2 5 4" | |
] | |
}, | |
"execution_count": 2, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# Let's assume our data is in a 2D UInt8 array\n", | |
"arrays = rand(UInt8, (narrays, array_bytes));\n", | |
"\n", | |
"# Perhaps we can count ones of the rows\n", | |
"count_ones.(arrays)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 3, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"10×1 Array{Int64,2}:\n", | |
" 509\n", | |
" 519\n", | |
" 523\n", | |
" 492\n", | |
" 516\n", | |
" 493\n", | |
" 528\n", | |
" 524\n", | |
" 507\n", | |
" 519" | |
] | |
}, | |
"execution_count": 3, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# That is running popcount on each UInt8 which probably isn't going to be great\n", | |
"# for performance, but let's see how it plays out.\n", | |
"\n", | |
"# We will make a function out of broadcasting `count_ones` for easier benchmarking\n", | |
"popcount_2d_byte_arrays(arrays::Array{UInt8, 2}) = sum(count_ones.(arrays), dims=2)\n", | |
"\n", | |
"popcount_2d_byte_arrays(arrays)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"Well that seems to work, let's time it." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 4, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"benchmark (generic function with 1 method)" | |
] | |
}, | |
"execution_count": 4, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"function benchmark(arrays, popcountf, fname, num_bytes)\n", | |
" num_runs = 100\n", | |
" # Call it once to ensure function is compiled\n", | |
" popcountf(arrays)\n", | |
" total_time = @elapsed for run in 1:num_runs\n", | |
" counts = popcountf(arrays)\n", | |
" end\n", | |
" avg_run_s = total_time/num_runs\n", | |
"\n", | |
" println(\"Avg run speed from julia: \", avg_run_s)\n", | |
"\n", | |
" bandwidth = num_bytes / (avg_run_s * 2^20)\n", | |
" printfmt(\"Julia $(fname) popcount measured bandwidth of {:.1f} MiB/s\", bandwidth)\n", | |
" return avg_run_s\n", | |
"end" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 5, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"Avg run speed from julia: 0.04621711034\n", | |
"Julia 2d byte array popcount measured bandwidth of 264.1 MiB/s" | |
] | |
} | |
], | |
"source": [ | |
"narrays = 100000\n", | |
"array_bytes = 128\n", | |
"arrays_bytes = rand(UInt8, (narrays, array_bytes))\n", | |
"\n", | |
"benchmark(arrays_bytes, popcount_2d_byte_arrays, \"2d byte array\", narrays * array_bytes);" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"So we have a starting point - `~270 MiB/s`. Now we saw that we were essentially calling `count_ones` on individual bytes, what would happen if our elements were a bit bigger - say machine word sized? Let's use a `UInt64` instead of a `UInt8`." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 6, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"Avg run speed from julia: 0.00291228427\n", | |
"Julia 2d UInt64 array popcount measured bandwidth of 4191.6 MiB/s" | |
] | |
} | |
], | |
"source": [ | |
"# Same function definition as before - just different input type.\n", | |
"popcount_2d_uint64_arrays(arrays::Array{UInt64, 2}) = sum(count_ones.(arrays), dims=2)\n", | |
"\n", | |
"# input array now has 8 times less elements.\n", | |
"arrays_64 = rand(UInt64, (narrays, Int(array_bytes/8)));\n", | |
"\n", | |
"benchmark(arrays_64, popcount_2d_uint64_arrays, \"2d UInt64 array\", narrays * array_bytes);" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"Well! Now we are talking in `GiB/s`. A good improvement over our first stab too. This is probably because the compiler can take advantage of a popcount instruction that works with a width of 64 bits. We can actually see this by calling `code_native`:" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 7, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"\t.text\n", | |
"; Function count_ones {\n", | |
"; Location: int.jl:358\n", | |
"\tpopcntq\t%rdi, %rax\n", | |
"\tretq\n", | |
"\tnopw\t%cs:(%rax,%rax)\n", | |
";}\n" | |
] | |
} | |
], | |
"source": [ | |
"# The count_ones wraps the native popcntq instruction.\n", | |
"code_native(count_ones, (UInt64,))" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 8, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"\t.text\n", | |
"; Function count_ones {\n", | |
"; Location: int.jl:358\n", | |
"\tmovzbl\t%dil, %eax\n", | |
"\tpopcntl\t%eax, %eax\n", | |
"\tretq\n", | |
"\tnopl\t(%rax)\n", | |
";}\n" | |
] | |
} | |
], | |
"source": [ | |
"# The count_ones wraps the native popcnt instructions\n", | |
"code_native(count_ones, (UInt8,))" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 9, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"\t.text\n", | |
"; Function count_ones {\n", | |
"; Location: int.jl:358\n", | |
"\tpopcntq\t%rsi, %rcx\n", | |
"\tpopcntq\t%rdi, %rax\n", | |
"\taddq\t%rcx, %rax\n", | |
"\tretq\n", | |
"\tnop\n", | |
";}\n" | |
] | |
} | |
], | |
"source": [ | |
"# The count_ones wraps the native popcnt instructions\n", | |
"code_native(count_ones, (UInt128,))" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 10, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"Avg run speed from julia: 0.00171447827\n", | |
"Julia 2d UInt128 array popcount measured bandwidth of 7120.0 MiB/s" | |
] | |
} | |
], | |
"source": [ | |
"# Well the UInt128 instructions are just calling the popcntq twice and adding the result (in assembly)\n", | |
"# We should see a speed up by using 128bit integers.\n", | |
"# Same function definition as before - just different input type.\n", | |
"popcount_2d_uint128_arrays(arrays::Array{UInt128, 2}) = sum(count_ones.(arrays), dims=2)\n", | |
"\n", | |
"# input array now has 16 times less elements.\n", | |
"arrays_128 = rand(UInt128, (narrays, Int(array_bytes/16)));\n", | |
"\n", | |
"benchmark(arrays_128, popcount_2d_uint128_arrays, \"2d UInt128 array\", narrays * array_bytes);" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 11, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"Avg run speed from julia: 0.01657183077\n", | |
"Julia ntuple of uint64s popcount measured bandwidth of 7366.1 MiB/s" | |
] | |
} | |
], | |
"source": [ | |
"# Note 128 Bytes is 1024 bits. Which is 16 x 64bits\n", | |
"# Lets assume that working on 4 x 64bits at a time is best able to be vectorized.\n", | |
"fourwords = NTuple{4, UInt64}\n", | |
"\n", | |
"function fword_popcount(arrays)\n", | |
" counts = zeros(Int32, size(arrays)[1])\n", | |
" for i in eachindex(arrays)\n", | |
" pcount = 0\n", | |
" for j in 1:4\n", | |
" pcount += sum(count_ones.(arrays[i][j]))\n", | |
" end\n", | |
" counts[i] = pcount\n", | |
" end\n", | |
" return counts\n", | |
"end\n", | |
"\n", | |
"make_fword() = [fourwords(rand(UInt64, 4)) for i in 1:4]\n", | |
"narrays = 1000000\n", | |
"fdata = [make_fword() for i in 1:narrays]\n", | |
"\n", | |
"benchmark(fdata, fword_popcount, \"ntuple of uint64s\", narrays * array_bytes);" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"Some increased preformance for our efforts.\n", | |
"\n", | |
"Now a more natural datatype for the user might be an array of `BitArray` instances." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 12, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"((1000000,), (1024,))" | |
] | |
}, | |
"execution_count": 12, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"bitarrays = [BitArray(rand(Bool, array_bytes * 8)) for i in 1:narrays];\n", | |
"size(bitarrays), size(bitarrays[1])" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 13, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"534" | |
] | |
}, | |
"execution_count": 13, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# Pretty cool that this works\n", | |
"count(bitarrays[1])" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 14, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"Avg run speed from julia: 0.01412127291\n", | |
"Julia broadcast count on BitArray popcount measured bandwidth of 8644.4 MiB/s" | |
] | |
} | |
], | |
"source": [ | |
"# Just broadcasting the builtin `count` on the bitarray\n", | |
"popcount_broadcast_count(bitarrays) = count.(bitarrays)\n", | |
"\n", | |
"benchmark(bitarrays, popcount_broadcast_count, \"broadcast count on BitArray\", narrays * array_bytes);" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"`~9 GiB/s` using the builtin `count`. Well any half way sensible person would be happy with that." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 15, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"16-element Array{UInt64,1}:\n", | |
" 0xff35c936649e4b09\n", | |
" 0x99e0ef02ff3d8dbe\n", | |
" 0x8e47077312897feb\n", | |
" 0x4f5e80ccbb130238\n", | |
" 0xc5e0cffe2acd7174\n", | |
" 0x5abff90455268611\n", | |
" 0x2f6645cd50f4d9c0\n", | |
" 0x57eaaf7e6658b4e0\n", | |
" 0x2abb4fe469c1dc8a\n", | |
" 0xffa2383b0676a71e\n", | |
" 0xbea07f0615cdd6c9\n", | |
" 0x16fe9c48ede9b493\n", | |
" 0xa671a5c4e61d3387\n", | |
" 0xe0fd7542fae48ba1\n", | |
" 0xc077d70393b29792\n", | |
" 0x69bdc92b265316d2" | |
] | |
}, | |
"execution_count": 15, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# Observe that BitArray instances have a `.chunks` attribute\n", | |
"# with raw UInt64 elements that we can use to efficiently popcount:\n", | |
"bitarrays[1].chunks" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 16, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"16-element Array{Int64,1}:\n", | |
" 34\n", | |
" 38\n", | |
" 34\n", | |
" 28\n", | |
" 36\n", | |
" 30\n", | |
" 31\n", | |
" 36\n", | |
" 33\n", | |
" 35\n", | |
" 34\n", | |
" 35\n", | |
" 32\n", | |
" 34\n", | |
" 32\n", | |
" 32" | |
] | |
}, | |
"execution_count": 16, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# So we can broadcast the `count_ones` function across these chunks\n", | |
"count_ones.(bitarrays[1].chunks)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 17, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"534" | |
] | |
}, | |
"execution_count": 17, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# And sum them up\n", | |
"sum(count_ones.(bitarrays[1].chunks))" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 23, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"Avg run speed from julia: 0.06992416397000001\n", | |
"Julia BitArray count_ones of chunks popcount measured bandwidth of 1745.8 MiB/s" | |
] | |
} | |
], | |
"source": [ | |
"popcount_ba_chunks(chunks::Array{UInt64}) = sum(count_ones.(chunks))\n", | |
" \n", | |
"function popcount_sum_count_ones(bitarrays)\n", | |
" l = size(bitarrays)[1]\n", | |
" counts = zeros(Int64, l)\n", | |
" for i in 1:l\n", | |
" counts[i] = popcount_ba_chunks(bitarrays[i].chunks)\n", | |
" end\n", | |
" return counts\n", | |
"end\n", | |
"\n", | |
"benchmark(bitarrays, popcount_sum_count_ones, \"BitArray count_ones of chunks\", narrays * array_bytes);" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"Ok so clearly Julia knows a better way to count ones than that. Let's try something different and wrap some C++ code and call that from Julia... The original code can be found on [github](https://github.com/n1analytics/anonlink/blob/master/_cffi_build/dice_one_against_many.cpp#L49).\n", | |
"\n", | |
"I'm trying to make the benchmarking fair in that we will still preallocate the output, noting that particular speed up would help all of the methods." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 19, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"popcount (generic function with 2 methods)" | |
] | |
}, | |
"execution_count": 19, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"popcount(arrays::Array{UInt8}, array_bytes::Int64, narrays::Int64) = popcount(zeros(Int32, narrays), arrays, array_bytes, narrays)\n", | |
"function popcount(counts::Array{Int32}, arrays::Array{UInt8}, array_bytes::Int64, narrays::Int64)\n", | |
" elapsed_ms = ccall(\n", | |
" (:popcount_arrays, \"_countingbits.so\"), Cdouble,\n", | |
" (Ptr{Int32},Ptr{UInt8},Cint,Cint),\n", | |
" counts, arrays, narrays, array_bytes\n", | |
" )\n", | |
" return counts\n", | |
"end" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 20, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"Avg run speed from julia: 0.00864360435\n", | |
"Julia C++ from anonlink popcount measured bandwidth of 14122.6 MiB/s" | |
] | |
} | |
], | |
"source": [ | |
"arrays = rand(UInt8, narrays * array_bytes)\n", | |
"f(arrays) = popcount(arrays, array_bytes, narrays)\n", | |
"\n", | |
"benchmark(arrays, f, \"C++ from anonlink\", narrays * array_bytes);" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"You have no idea how worried I was that it wouldn't be faster! Still it isn't faster by much. Can you beat it?" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 21, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"10-element Array{Tuple{Int64,Int64},1}:\n", | |
" (504, 504)\n", | |
" (497, 497)\n", | |
" (537, 537)\n", | |
" (496, 496)\n", | |
" (492, 492)\n", | |
" (520, 520)\n", | |
" (505, 505)\n", | |
" (504, 504)\n", | |
" (492, 492)\n", | |
" (507, 507)" | |
] | |
}, | |
"execution_count": 21, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# To check it is correct we convert the memory layout and compare:\n", | |
"reshaped_2d = reshape(arrays, (narrays, array_bytes))\n", | |
"\n", | |
"function convert_memory(x::Array{T, 2}) where T\n", | |
" ninds = axes(x, 1)\n", | |
" binds = axes(x, 2)\n", | |
" \n", | |
" out = similar(Array{T}, binds, ninds)\n", | |
" for col = ninds, row = binds\n", | |
" out[row, col] = x[col, row]\n", | |
" end\n", | |
" return out\n", | |
"end\n", | |
"\n", | |
"a = [Int64(i) for i in popcount(convert_memory(reshaped_2d), array_bytes, narrays)]\n", | |
"b = popcount_2d_byte_arrays(reshaped_2d)\n", | |
"\n", | |
"collect(zip(a, b))[1:10]" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 22, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"true" | |
] | |
}, | |
"execution_count": 22, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"count(a .== b) == length(a)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Julia 1.0.3", | |
"language": "julia", | |
"name": "julia-1.0" | |
}, | |
"language_info": { | |
"file_extension": ".jl", | |
"mimetype": "application/julia", | |
"name": "julia", | |
"version": "1.0.3" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 2 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment