hardbyte/Julia Popcount.ipynb

## _countingbits.so

      
    Raw
  

              _countingbits.so
            
          
            View raw
        
    
## countingbits.cpp
#include <memory>
#include <functional>
#include <algorithm>
#include <vector>
#include <queue>
#include <stdint.h>
#include <cstdlib>
#include <ctime>
#include <cassert>
#include <climits>

static constexpr int WORD_BYTES = sizeof(uint64_t);

/**
 * The popcount of n elements of buf is the sum of c0, c1, c2, c3.
 */
template<int n>
static inline void
popcount(
        uint64_t &c0, uint64_t &c1, uint64_t &c2, uint64_t &c3,
        const uint64_t *buf) {
    popcount<4>(c0, c1, c2, c3, buf);
    popcount<n - 4>(c0, c1, c2, c3, buf + 4);
}

// Fast Path
//
// Source: http://danluu.com/assembly-intrinsics/
// https://stackoverflow.com/questions/25078285/replacing-a-32-bit-loop-count-variable-with-64-bit-introduces-crazy-performance
//
// NB: Dan Luu's original assembly (and the SO answer it was based on)
// is incorrect because it clobbers registers marked as "input only"
// (see warning at
// https://gcc.gnu.org/onlinedocs/gcc/Extended-Asm.html#InputOperands
// -- this mistake does not materialise with GCC (4.9), but it does
// with Clang (3.6 and 3.8)).  We fix the mistake by explicitly
// loading the contents of buf into registers and using these same
// registers for the intermediate popcnts.
/**
 * The popcount of 4 elements of buf is the sum of c0, c1, c2, c3.
 */
template<>
inline void
popcount<4>(
        uint64_t &c0, uint64_t &c1, uint64_t &c2, uint64_t &c3,
        const uint64_t *buf) {
    uint64_t b0, b1, b2, b3;
    b0 = buf[0]; b1 = buf[1]; b2 = buf[2]; b3 = buf[3];
    __asm__(
        "popcnt %4, %4  \n\t"
        "add %4, %0     \n\t"
        "popcnt %5, %5  \n\t"
        "add %5, %1     \n\t"
        "popcnt %6, %6  \n\t"
        "add %6, %2     \n\t"
        "popcnt %7, %7  \n\t"
        "add %7, %3     \n\t"
        : "+r" (c0), "+r" (c1), "+r" (c2), "+r" (c3),
          "+r" (b0), "+r" (b1), "+r" (b2), "+r" (b3));
}

// Slow paths
// TODO: Assumes sizeof(long) == 8
//
// NB: The specialisation to n=3 is not currently used but included
// for completeness (i.e. so that popcount<n> is defined for all
// non-negative n) and in anticipation of its use in the near future.
#if 0
template<>
inline void
popcount<3>(
        uint64_t &c0, uint64_t &c1, uint64_t &c2, uint64_t &,
        const uint64_t *buf) {
    c0 += __builtin_popcountl(buf[0]);
    c1 += __builtin_popcountl(buf[1]);
    c2 += __builtin_popcountl(buf[2]);
}
#endif

/**
 * The popcount of 2 elements of buf is the sum of c0, c1.
 */
template<>
inline void
popcount<2>(
        uint64_t &c0, uint64_t &c1, uint64_t &, uint64_t &,
        const uint64_t *buf) {
    c0 += __builtin_popcountl(buf[0]);
    c1 += __builtin_popcountl(buf[1]);
}

/**
 * The popcount *buf is in c0.
 */
template<>
inline void
popcount<1>(
        uint64_t &c0, uint64_t &, uint64_t &, uint64_t &,
        const uint64_t *buf) {
    c0 += __builtin_popcountl(buf[0]);
}


/**
 * Calculate population counts of an array of inputs of nwords elements.
 *
 * 'arrays' must point to narrays*nwords*WORD_BYTES bytes
 * 'counts' must point to narrays*sizeof(uint32_t) bytes.
 * For i = 0 to narrays - 1, the population count of the nwords elements
 *
 *   arrays[i * nwords] ... arrays[(i + 1) * nwords - 1]
 *
 * is put in counts[i].
 */
template<int nwords>
static void
_popcount_arrays(uint32_t *counts, const uint64_t *arrays, int narrays) {
    uint64_t c0, c1, c2, c3;
    for (int i = 0; i < narrays; ++i, arrays += nwords) {
        c0 = c1 = c2 = c3 = 0;
        popcount<nwords>(c0, c1, c2, c3, arrays);
        counts[i] = c0 + c1 + c2 + c3;
    }
}

/**
 * Return the popcount of the nwords elements starting at array.
 */
static uint32_t
_popcount_array(const uint64_t *array, int nwords) {
    uint64_t c0, c1, c2, c3;
    c0 = c1 = c2 = c3 = 0;

    while (nwords >= 16) {
        popcount<16>(c0, c1, c2, c3, array);
        array += 16;
        nwords -= 16;
    }
    // nwords < 16
    if (nwords >= 8) {
        popcount<8>(c0, c1, c2, c3, array);
        array += 8;
        nwords -= 8;
    }
    // nwords < 8
    if (nwords >= 4) {
        popcount<4>(c0, c1, c2, c3, array);
        array += 4;
        nwords -= 4;
    }
    // nwords < 4
    if (nwords >= 2) {
        popcount<2>(c0, c1, c2, c3, array);
        array += 2;
        nwords -= 2;
    }
    // nwords < 2
    if (nwords == 1)
        popcount<1>(c0, c1, c2, c3, array);
    return c0 + c1 + c2 + c3;
}

/**
 * Convert clock measurement t to milliseconds.
 *
 * t should have been obtained as the difference of calls to clock()
 * for this to make sense.
 */
static inline double
to_millis(clock_t t) {
    static constexpr double CPS = (double)CLOCKS_PER_SEC;
    return t * 1.0E3 / CPS;
}

static inline uint32_t
abs_diff(uint32_t a, uint32_t b) {
    if (a > b)
        return a - b;
    return b - a;
}


typedef const uint64_t word_tp;
typedef std::function<void (word_tp *)> deleter_fn;
typedef std::unique_ptr<word_tp, deleter_fn> word_ptr;

/**
 * Given a char pointer ptr pointing to nbytes bytes, return a
 * std::unique_ptr to uint64_t containing those same byte values
 * (using the host's byte order, i.e. no byte reordering is done).
 *
 * The main purpose of this function is to fix pointer alignment
 * issues when converting from a char pointer to a uint64 pointer; the
 * issue being that a uint64 pointer has stricter alignment
 * requirements than a char pointer.
 *
 * We assume that releasing the memory associated with ptr is someone
 * else's problem. So, if ptr is already correctly aligned, we just
 * cast it to uint64 and return a unique_ptr whose deleter is
 * empty. If ptr is misaligned, then we copy the nbytes at ptr to a
 * location that is correctly aligned and then return a unique_ptr
 * whose deleter will delete that allocated space when the unique_ptr
 * goes out of scope.
 *
 * NB: ASSUMES nbytes is divisible by WORD_BYTES.
 *
 * TODO: On some architectures it might be advantageous to have
 * 16-byte aligned memory, even when the words used are only 8 bytes
 * (this is to do with cache line size). This function could be easily
 * modified to allow experimentation with different alignments.
 */
static word_ptr
adjust_ptr_alignment(const char *ptr, size_t nbytes) {
    static constexpr size_t wordbytes = sizeof(word_tp);
    size_t nwords = nbytes / wordbytes;
    uintptr_t addr = reinterpret_cast<uintptr_t>(ptr);
    // ptr is correctly aligned if addr is divisible by wordbytes
    if (addr % wordbytes != 0) {
        // ptr is NOT correctly aligned

        // copy_words has correct alignment
        uint64_t *copy_words = new uint64_t[nwords];
        // alias copy_words with a byte pointer; the cast safe because
        // uint8_t alignment is less restrictive than uint64_t
        uint8_t *copy_bytes = reinterpret_cast<uint8_t *>(copy_words);
        // copy everything across
        std::copy(ptr, ptr + nbytes, copy_bytes);
        // return a unique_ptr with array delete to delete copy_words
        auto array_delete = [] (word_tp *p) { delete[] p; };
        return word_ptr(copy_words, array_delete);
    } else {
        // ptr IS correctly aligned

        // safe cast because the address of ptr is wordbyte-aligned.
        word_tp *same = reinterpret_cast<word_tp *>(ptr);
        // don't delete backing array since we don't own it
        auto empty_delete = [] (word_tp *) { };
        return word_ptr(same, empty_delete);
    }
}


extern "C"
{
    /**
     * Calculate population counts of an array of inputs; return how
     * long it took in milliseconds.
     *
     * 'arrays' must point to narrays*array_bytes bytes
     * 'counts' must point to narrays*sizeof(uint32_t) bytes.
     * For i = 0 to n - 1, the population count of the array_bytes*8 bits
     *
     *   arrays[i * array_bytes] ... arrays[(i + 1) * array_bytes - 1]
     *
     * is put in counts[i].
     *
     * ASSUMES: array_bytes is divisible by 8.
     */
    double
    popcount_arrays(
            uint32_t *counts,
            const char *arrays, int narrays, int array_bytes) {
        // assumes WORD_BYTES divides array_bytes
        int nwords = array_bytes / WORD_BYTES;
        // The static_cast is to avoid int overflow in the multiplication
        size_t total_bytes = static_cast<size_t>(narrays) * array_bytes;
        auto ptr = adjust_ptr_alignment(arrays, total_bytes);
        auto u = ptr.get();

        clock_t t = clock();
        switch (nwords) {
        case 32: _popcount_arrays<32>(counts, u, narrays); break;
        case 16: _popcount_arrays<16>(counts, u, narrays); break;
        case  8: _popcount_arrays< 8>(counts, u, narrays); break;
        default:
            for (int i = 0; i < narrays; ++i, u += nwords)
                counts[i] = _popcount_array(u, nwords);
        }
        return to_millis(clock() - t);
    }
}

## Julia Popcount.ipynb
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "So the idea is to popcount vectors of 128 byte data as fast as possible. Want to try see if I can do it faster in C++ or Julia..."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "128"
      ]
     },
     "execution_count": 1,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Some random data to popcount - should expect mean popcount per 128 bytes to be 512.\n",
    "using Formatting: printfmt\n",
    "narrays = 10\n",
    "array_bytes = 128"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "10×128 Array{Int64,2}:\n",
       " 2  6  4  4  4  1  3  4  4  7  7  3  4  …  5  2  3  3  4  1  3  4  5  3  4  2\n",
       " 5  4  5  1  5  5  4  3  5  3  4  2  5     6  3  5  5  5  6  6  3  5  2  6  3\n",
       " 1  5  1  5  4  5  4  4  5  4  4  3  6     1  4  3  5  2  5  1  6  4  5  6  6\n",
       " 7  3  6  2  4  5  3  6  5  5  3  4  7     3  3  4  3  3  5  3  2  4  6  3  3\n",
       " 5  6  2  2  4  5  4  2  5  3  5  4  2     2  3  3  4  3  6  2  6  4  6  5  4\n",
       " 3  3  6  6  4  6  2  3  3  6  3  3  5  …  3  4  4  4  7  5  3  4  6  3  3  2\n",
       " 4  3  3  4  8  1  6  6  5  3  4  4  5     5  3  5  7  5  5  4  3  3  4  5  4\n",
       " 7  6  7  2  4  4  6  6  4  4  3  4  2     3  5  3  4  5  5  1  3  6  7  3  4\n",
       " 4  5  4  2  3  4  3  5  2  4  1  5  5     3  4  3  2  3  5  2  6  4  7  6  5\n",
       " 4  4  4  3  4  3  5  5  3  7  6  1  4     2  5  4  3  5  3  2  2  6  2  5  4"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Let's assume our data is in a 2D UInt8 array\n",
    "arrays = rand(UInt8, (narrays, array_bytes));\n",
    "\n",
    "# Perhaps we can count ones of the rows\n",
    "count_ones.(arrays)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "10×1 Array{Int64,2}:\n",
       " 509\n",
       " 519\n",
       " 523\n",
       " 492\n",
       " 516\n",
       " 493\n",
       " 528\n",
       " 524\n",
       " 507\n",
       " 519"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# That is running popcount on each UInt8 which probably isn't going to be great\n",
    "# for performance, but let's see how it plays out.\n",
    "\n",
    "# We will make a function out of broadcasting `count_ones` for easier benchmarking\n",
    "popcount_2d_byte_arrays(arrays::Array{UInt8, 2}) = sum(count_ones.(arrays), dims=2)\n",
    "\n",
    "popcount_2d_byte_arrays(arrays)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Well that seems to work, let's time it."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "benchmark (generic function with 1 method)"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "function benchmark(arrays, popcountf, fname, num_bytes)\n",
    "    num_runs = 100\n",
    "    # Call it once to ensure function is compiled\n",
    "    popcountf(arrays)\n",
    "    total_time = @elapsed for run in 1:num_runs\n",
    "        counts = popcountf(arrays)\n",
    "    end\n",
    "    avg_run_s = total_time/num_runs\n",
    "\n",
    "    println(\"Avg run speed from julia: \", avg_run_s)\n",
    "\n",
    "    bandwidth = num_bytes / (avg_run_s * 2^20)\n",
    "    printfmt(\"Julia $(fname) popcount measured bandwidth of {:.1f} MiB/s\", bandwidth)\n",
    "    return avg_run_s\n",
    "end"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Avg run speed from julia: 0.04621711034\n",
      "Julia 2d byte array popcount measured bandwidth of 264.1 MiB/s"
     ]
    }
   ],
   "source": [
    "narrays = 100000\n",
    "array_bytes = 128\n",
    "arrays_bytes = rand(UInt8, (narrays, array_bytes))\n",
    "\n",
    "benchmark(arrays_bytes, popcount_2d_byte_arrays, \"2d byte array\", narrays * array_bytes);"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "So we have a starting point - `~270 MiB/s`. Now we saw that we were essentially calling `count_ones` on individual bytes, what would happen if our elements were a bit bigger - say machine word sized? Let's use a `UInt64` instead of a `UInt8`."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Avg run speed from julia: 0.00291228427\n",
      "Julia 2d UInt64 array popcount measured bandwidth of 4191.6 MiB/s"
     ]
    }
   ],
   "source": [
    "# Same function definition as before - just different input type.\n",
    "popcount_2d_uint64_arrays(arrays::Array{UInt64, 2}) =  sum(count_ones.(arrays), dims=2)\n",
    "\n",
    "# input array now has 8 times less elements.\n",
    "arrays_64 = rand(UInt64, (narrays, Int(array_bytes/8)));\n",
    "\n",
    "benchmark(arrays_64, popcount_2d_uint64_arrays, \"2d UInt64 array\", narrays * array_bytes);"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Well! Now we are talking in `GiB/s`. A good improvement over our first stab too. This is probably because the compiler can take advantage of a popcount instruction that works with a width of 64 bits. We can actually see this by calling `code_native`:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\t.text\n",
      "; Function count_ones {\n",
      "; Location: int.jl:358\n",
      "\tpopcntq\t%rdi, %rax\n",
      "\tretq\n",
      "\tnopw\t%cs:(%rax,%rax)\n",
      ";}\n"
     ]
    }
   ],
   "source": [
    "# The count_ones wraps the native popcntq instruction.\n",
    "code_native(count_ones, (UInt64,))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\t.text\n",
      "; Function count_ones {\n",
      "; Location: int.jl:358\n",
      "\tmovzbl\t%dil, %eax\n",
      "\tpopcntl\t%eax, %eax\n",
      "\tretq\n",
      "\tnopl\t(%rax)\n",
      ";}\n"
     ]
    }
   ],
   "source": [
    "# The count_ones wraps the native popcnt instructions\n",
    "code_native(count_ones, (UInt8,))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\t.text\n",
      "; Function count_ones {\n",
      "; Location: int.jl:358\n",
      "\tpopcntq\t%rsi, %rcx\n",
      "\tpopcntq\t%rdi, %rax\n",
      "\taddq\t%rcx, %rax\n",
      "\tretq\n",
      "\tnop\n",
      ";}\n"
     ]
    }
   ],
   "source": [
    "# The count_ones wraps the native popcnt instructions\n",
    "code_native(count_ones, (UInt128,))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Avg run speed from julia: 0.00171447827\n",
      "Julia 2d UInt128 array popcount measured bandwidth of 7120.0 MiB/s"
     ]
    }
   ],
   "source": [
    "# Well the UInt128 instructions are just calling the popcntq twice and adding the result (in assembly)\n",
    "# We should see a speed up by using 128bit integers.\n",
    "# Same function definition as before - just different input type.\n",
    "popcount_2d_uint128_arrays(arrays::Array{UInt128, 2}) =  sum(count_ones.(arrays), dims=2)\n",
    "\n",
    "# input array now has 16 times less elements.\n",
    "arrays_128 = rand(UInt128, (narrays, Int(array_bytes/16)));\n",
    "\n",
    "benchmark(arrays_128, popcount_2d_uint128_arrays, \"2d UInt128 array\", narrays * array_bytes);"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Avg run speed from julia: 0.01657183077\n",
      "Julia ntuple of uint64s popcount measured bandwidth of 7366.1 MiB/s"
     ]
    }
   ],
   "source": [
    "# Note 128 Bytes is 1024 bits. Which is 16 x 64bits\n",
    "# Lets assume that working on 4 x 64bits at a time is best able to be vectorized.\n",
    "fourwords = NTuple{4, UInt64}\n",
    "\n",
    "function fword_popcount(arrays)\n",
    "    counts = zeros(Int32, size(arrays)[1])\n",
    "    for i in eachindex(arrays)\n",
    "        pcount = 0\n",
    "        for j in 1:4\n",
    "            pcount += sum(count_ones.(arrays[i][j]))\n",
    "        end\n",
    "        counts[i] = pcount\n",
    "    end\n",
    "    return counts\n",
    "end\n",
    "\n",
    "make_fword() = [fourwords(rand(UInt64, 4)) for i in 1:4]\n",
    "narrays = 1000000\n",
    "fdata = [make_fword() for i in 1:narrays]\n",
    "\n",
    "benchmark(fdata, fword_popcount, \"ntuple of uint64s\", narrays * array_bytes);"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Some increased preformance for our efforts.\n",
    "\n",
    "Now a more natural datatype for the user might be an array of `BitArray` instances."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "((1000000,), (1024,))"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "bitarrays = [BitArray(rand(Bool, array_bytes * 8)) for i in 1:narrays];\n",
    "size(bitarrays), size(bitarrays[1])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "534"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Pretty cool that this works\n",
    "count(bitarrays[1])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Avg run speed from julia: 0.01412127291\n",
      "Julia broadcast count on BitArray popcount measured bandwidth of 8644.4 MiB/s"
     ]
    }
   ],
   "source": [
    "# Just broadcasting the builtin `count` on the bitarray\n",
    "popcount_broadcast_count(bitarrays) = count.(bitarrays)\n",
    "\n",
    "benchmark(bitarrays, popcount_broadcast_count, \"broadcast count on BitArray\", narrays * array_bytes);"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "`~9 GiB/s` using the builtin `count`. Well any half way sensible person would be happy with that."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "16-element Array{UInt64,1}:\n",
       " 0xff35c936649e4b09\n",
       " 0x99e0ef02ff3d8dbe\n",
       " 0x8e47077312897feb\n",
       " 0x4f5e80ccbb130238\n",
       " 0xc5e0cffe2acd7174\n",
       " 0x5abff90455268611\n",
       " 0x2f6645cd50f4d9c0\n",
       " 0x57eaaf7e6658b4e0\n",
       " 0x2abb4fe469c1dc8a\n",
       " 0xffa2383b0676a71e\n",
       " 0xbea07f0615cdd6c9\n",
       " 0x16fe9c48ede9b493\n",
       " 0xa671a5c4e61d3387\n",
       " 0xe0fd7542fae48ba1\n",
       " 0xc077d70393b29792\n",
       " 0x69bdc92b265316d2"
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Observe that BitArray instances have a `.chunks` attribute\n",
    "# with raw UInt64 elements that we can use to efficiently popcount:\n",
    "bitarrays[1].chunks"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "16-element Array{Int64,1}:\n",
       " 34\n",
       " 38\n",
       " 34\n",
       " 28\n",
       " 36\n",
       " 30\n",
       " 31\n",
       " 36\n",
       " 33\n",
       " 35\n",
       " 34\n",
       " 35\n",
       " 32\n",
       " 34\n",
       " 32\n",
       " 32"
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# So we can broadcast the `count_ones` function across these chunks\n",
    "count_ones.(bitarrays[1].chunks)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "534"
      ]
     },
     "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# And sum them up\n",
    "sum(count_ones.(bitarrays[1].chunks))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Avg run speed from julia: 0.06992416397000001\n",
      "Julia BitArray count_ones of chunks popcount measured bandwidth of 1745.8 MiB/s"
     ]
    }
   ],
   "source": [
    "popcount_ba_chunks(chunks::Array{UInt64}) = sum(count_ones.(chunks))\n",
    "    \n",
    "function popcount_sum_count_ones(bitarrays)\n",
    "    l = size(bitarrays)[1]\n",
    "    counts = zeros(Int64, l)\n",
    "    for i in 1:l\n",
    "        counts[i] = popcount_ba_chunks(bitarrays[i].chunks)\n",
    "    end\n",
    "    return counts\n",
    "end\n",
    "\n",
    "benchmark(bitarrays, popcount_sum_count_ones, \"BitArray count_ones of chunks\", narrays * array_bytes);"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Ok so clearly Julia knows a better way to count ones than that. Let's try something different and wrap some C++ code and call that from Julia... The original code can be found on [github](https://github.com/n1analytics/anonlink/blob/master/_cffi_build/dice_one_against_many.cpp#L49).\n",
    "\n",
    "I'm trying to make the benchmarking fair in that we will still preallocate the output, noting that particular speed up would help all of the methods."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "popcount (generic function with 2 methods)"
      ]
     },
     "execution_count": 19,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "popcount(arrays::Array{UInt8}, array_bytes::Int64, narrays::Int64) = popcount(zeros(Int32, narrays), arrays, array_bytes, narrays)\n",
    "function popcount(counts::Array{Int32}, arrays::Array{UInt8}, array_bytes::Int64, narrays::Int64)\n",
    "    elapsed_ms = ccall(\n",
    "        (:popcount_arrays, \"_countingbits.so\"), Cdouble,\n",
    "          (Ptr{Int32},Ptr{UInt8},Cint,Cint),\n",
    "          counts, arrays, narrays, array_bytes\n",
    "    )\n",
    "    return counts\n",
    "end"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Avg run speed from julia: 0.00864360435\n",
      "Julia C++ from anonlink popcount measured bandwidth of 14122.6 MiB/s"
     ]
    }
   ],
   "source": [
    "arrays = rand(UInt8, narrays * array_bytes)\n",
    "f(arrays) = popcount(arrays, array_bytes, narrays)\n",
    "\n",
    "benchmark(arrays, f, \"C++ from anonlink\", narrays * array_bytes);"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "You have no idea how worried I was that it wouldn't be faster! Still it isn't faster by much. Can you beat it?"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "10-element Array{Tuple{Int64,Int64},1}:\n",
       " (504, 504)\n",
       " (497, 497)\n",
       " (537, 537)\n",
       " (496, 496)\n",
       " (492, 492)\n",
       " (520, 520)\n",
       " (505, 505)\n",
       " (504, 504)\n",
       " (492, 492)\n",
       " (507, 507)"
      ]
     },
     "execution_count": 21,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# To check it is correct we convert the memory layout and compare:\n",
    "reshaped_2d = reshape(arrays, (narrays, array_bytes))\n",
    "\n",
    "function convert_memory(x::Array{T, 2}) where T\n",
    "    ninds = axes(x, 1)\n",
    "    binds = axes(x, 2)\n",
    "    \n",
    "    out = similar(Array{T}, binds, ninds)\n",
    "    for col = ninds, row = binds\n",
    "        out[row, col] = x[col, row]\n",
    "    end\n",
    "    return out\n",
    "end\n",
    "\n",
    "a = [Int64(i) for i in popcount(convert_memory(reshaped_2d), array_bytes, narrays)]\n",
    "b = popcount_2d_byte_arrays(reshaped_2d)\n",
    "\n",
    "collect(zip(a, b))[1:10]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "true"
      ]
     },
     "execution_count": 22,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "count(a .== b) == length(a)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Julia 1.0.3",
   "language": "julia",
   "name": "julia-1.0"
  },
  "language_info": {
   "file_extension": ".jl",
   "mimetype": "application/julia",
   "name": "julia",
   "version": "1.0.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
	#include <memory>
	#include <functional>
	#include <algorithm>
	#include <vector>
	#include <queue>
	#include <stdint.h>
	#include <cstdlib>
	#include <ctime>
	#include <cassert>
	#include <climits>

	static constexpr int WORD_BYTES = sizeof(uint64_t);

	/**
	* The popcount of n elements of buf is the sum of c0, c1, c2, c3.
	*/
	template<int n>
	static inline void
	popcount(
	uint64_t &c0, uint64_t &c1, uint64_t &c2, uint64_t &c3,
	const uint64_t *buf) {
	popcount<4>(c0, c1, c2, c3, buf);
	popcount<n - 4>(c0, c1, c2, c3, buf + 4);
	}

	// Fast Path
	//
	// Source: http://danluu.com/assembly-intrinsics/
	// https://stackoverflow.com/questions/25078285/replacing-a-32-bit-loop-count-variable-with-64-bit-introduces-crazy-performance
	//
	// NB: Dan Luu's original assembly (and the SO answer it was based on)
	// is incorrect because it clobbers registers marked as "input only"
	// (see warning at
	// https://gcc.gnu.org/onlinedocs/gcc/Extended-Asm.html#InputOperands
	// -- this mistake does not materialise with GCC (4.9), but it does
	// with Clang (3.6 and 3.8)). We fix the mistake by explicitly
	// loading the contents of buf into registers and using these same
	// registers for the intermediate popcnts.
	/**
	* The popcount of 4 elements of buf is the sum of c0, c1, c2, c3.
	*/
	template<>
	inline void
	popcount<4>(
	uint64_t &c0, uint64_t &c1, uint64_t &c2, uint64_t &c3,
	const uint64_t *buf) {
	uint64_t b0, b1, b2, b3;
	b0 = buf[0]; b1 = buf[1]; b2 = buf[2]; b3 = buf[3];
	__asm__(
	"popcnt %4, %4 \n\t"
	"add %4, %0 \n\t"
	"popcnt %5, %5 \n\t"
	"add %5, %1 \n\t"
	"popcnt %6, %6 \n\t"
	"add %6, %2 \n\t"
	"popcnt %7, %7 \n\t"
	"add %7, %3 \n\t"
	: "+r" (c0), "+r" (c1), "+r" (c2), "+r" (c3),
	"+r" (b0), "+r" (b1), "+r" (b2), "+r" (b3));
	}

	// Slow paths
	// TODO: Assumes sizeof(long) == 8
	//
	// NB: The specialisation to n=3 is not currently used but included
	// for completeness (i.e. so that popcount<n> is defined for all
	// non-negative n) and in anticipation of its use in the near future.
	#if 0
	template<>
	inline void
	popcount<3>(
	uint64_t &c0, uint64_t &c1, uint64_t &c2, uint64_t &,
	const uint64_t *buf) {
	c0 += __builtin_popcountl(buf[0]);
	c1 += __builtin_popcountl(buf[1]);
	c2 += __builtin_popcountl(buf[2]);
	}
	#endif

	/**
	* The popcount of 2 elements of buf is the sum of c0, c1.
	*/
	template<>
	inline void
	popcount<2>(
	uint64_t &c0, uint64_t &c1, uint64_t &, uint64_t &,
	const uint64_t *buf) {
	c0 += __builtin_popcountl(buf[0]);
	c1 += __builtin_popcountl(buf[1]);
	}

	/**
	* The popcount *buf is in c0.
	*/
	template<>
	inline void
	popcount<1>(
	uint64_t &c0, uint64_t &, uint64_t &, uint64_t &,
	const uint64_t *buf) {
	c0 += __builtin_popcountl(buf[0]);
	}


	/**
	* Calculate population counts of an array of inputs of nwords elements.
	*
	* 'arrays' must point to narraysnwordsWORD_BYTES bytes
	* 'counts' must point to narrays*sizeof(uint32_t) bytes.
	* For i = 0 to narrays - 1, the population count of the nwords elements
	*
	* arrays[i * nwords] ... arrays[(i + 1) * nwords - 1]
	*
	* is put in counts[i].
	*/
	template<int nwords>
	static void
	_popcount_arrays(uint32_t counts, const uint64_t arrays, int narrays) {
	uint64_t c0, c1, c2, c3;
	for (int i = 0; i < narrays; ++i, arrays += nwords) {
	c0 = c1 = c2 = c3 = 0;
	popcount<nwords>(c0, c1, c2, c3, arrays);
	counts[i] = c0 + c1 + c2 + c3;
	}
	}

	/**
	* Return the popcount of the nwords elements starting at array.
	*/
	static uint32_t
	_popcount_array(const uint64_t *array, int nwords) {
	uint64_t c0, c1, c2, c3;
	c0 = c1 = c2 = c3 = 0;

	while (nwords >= 16) {
	popcount<16>(c0, c1, c2, c3, array);
	array += 16;
	nwords -= 16;
	}
	// nwords < 16
	if (nwords >= 8) {
	popcount<8>(c0, c1, c2, c3, array);
	array += 8;
	nwords -= 8;
	}
	// nwords < 8
	if (nwords >= 4) {
	popcount<4>(c0, c1, c2, c3, array);
	array += 4;
	nwords -= 4;
	}
	// nwords < 4
	if (nwords >= 2) {
	popcount<2>(c0, c1, c2, c3, array);
	array += 2;
	nwords -= 2;
	}
	// nwords < 2
	if (nwords == 1)
	popcount<1>(c0, c1, c2, c3, array);
	return c0 + c1 + c2 + c3;
	}

	/**
	* Convert clock measurement t to milliseconds.
	*
	* t should have been obtained as the difference of calls to clock()
	* for this to make sense.
	*/
	static inline double
	to_millis(clock_t t) {
	static constexpr double CPS = (double)CLOCKS_PER_SEC;
	return t * 1.0E3 / CPS;
	}

	static inline uint32_t
	abs_diff(uint32_t a, uint32_t b) {
	if (a > b)
	return a - b;
	return b - a;
	}


	typedef const uint64_t word_tp;
	typedef std::function<void (word_tp *)> deleter_fn;
	typedef std::unique_ptr<word_tp, deleter_fn> word_ptr;

	/**
	* Given a char pointer ptr pointing to nbytes bytes, return a
	* std::unique_ptr to uint64_t containing those same byte values
	* (using the host's byte order, i.e. no byte reordering is done).
	*
	* The main purpose of this function is to fix pointer alignment
	* issues when converting from a char pointer to a uint64 pointer; the
	* issue being that a uint64 pointer has stricter alignment
	* requirements than a char pointer.
	*
	* We assume that releasing the memory associated with ptr is someone
	* else's problem. So, if ptr is already correctly aligned, we just
	* cast it to uint64 and return a unique_ptr whose deleter is
	* empty. If ptr is misaligned, then we copy the nbytes at ptr to a
	* location that is correctly aligned and then return a unique_ptr
	* whose deleter will delete that allocated space when the unique_ptr
	* goes out of scope.
	*
	* NB: ASSUMES nbytes is divisible by WORD_BYTES.
	*
	* TODO: On some architectures it might be advantageous to have
	* 16-byte aligned memory, even when the words used are only 8 bytes
	* (this is to do with cache line size). This function could be easily
	* modified to allow experimentation with different alignments.
	*/
	static word_ptr
	adjust_ptr_alignment(const char *ptr, size_t nbytes) {
	static constexpr size_t wordbytes = sizeof(word_tp);
	size_t nwords = nbytes / wordbytes;
	uintptr_t addr = reinterpret_cast<uintptr_t>(ptr);
	// ptr is correctly aligned if addr is divisible by wordbytes
	if (addr % wordbytes != 0) {
	// ptr is NOT correctly aligned

	// copy_words has correct alignment
	uint64_t *copy_words = new uint64_t[nwords];
	// alias copy_words with a byte pointer; the cast safe because
	// uint8_t alignment is less restrictive than uint64_t
	uint8_t copy_bytes = reinterpret_cast<uint8_t >(copy_words);
	// copy everything across
	std::copy(ptr, ptr + nbytes, copy_bytes);
	// return a unique_ptr with array delete to delete copy_words
	auto array_delete = [] (word_tp *p) { delete[] p; };
	return word_ptr(copy_words, array_delete);
	} else {
	// ptr IS correctly aligned

	// safe cast because the address of ptr is wordbyte-aligned.
	word_tp same = reinterpret_cast<word_tp >(ptr);
	// don't delete backing array since we don't own it
	auto empty_delete = [] (word_tp *) { };
	return word_ptr(same, empty_delete);
	}
	}


	extern "C"
	{
	/**
	* Calculate population counts of an array of inputs; return how
	* long it took in milliseconds.
	*
	* 'arrays' must point to narrays*array_bytes bytes
	* 'counts' must point to narrays*sizeof(uint32_t) bytes.
	* For i = 0 to n - 1, the population count of the array_bytes*8 bits
	*
	* arrays[i * array_bytes] ... arrays[(i + 1) * array_bytes - 1]
	*
	* is put in counts[i].
	*
	* ASSUMES: array_bytes is divisible by 8.
	*/
	double
	popcount_arrays(
	uint32_t *counts,
	const char *arrays, int narrays, int array_bytes) {
	// assumes WORD_BYTES divides array_bytes
	int nwords = array_bytes / WORD_BYTES;
	// The static_cast is to avoid int overflow in the multiplication
	size_t total_bytes = static_cast<size_t>(narrays) * array_bytes;
	auto ptr = adjust_ptr_alignment(arrays, total_bytes);
	auto u = ptr.get();

	clock_t t = clock();
	switch (nwords) {
	case 32: _popcount_arrays<32>(counts, u, narrays); break;
	case 16: _popcount_arrays<16>(counts, u, narrays); break;
	case 8: _popcount_arrays< 8>(counts, u, narrays); break;
	default:
	for (int i = 0; i < narrays; ++i, u += nwords)
	counts[i] = _popcount_array(u, nwords);
	}
	return to_millis(clock() - t);
	}
	}
	{
	"cells": [
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"So the idea is to popcount vectors of 128 byte data as fast as possible. Want to try see if I can do it faster in C++ or Julia..."
	]
	},
	{
	"cell_type": "code",
	"execution_count": 1,
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"128"
	]
	},
	"execution_count": 1,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"# Some random data to popcount - should expect mean popcount per 128 bytes to be 512.\n",
	"using Formatting: printfmt\n",
	"narrays = 10\n",
	"array_bytes = 128"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 2,
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"10×128 Array{Int64,2}:\n",
	" 2 6 4 4 4 1 3 4 4 7 7 3 4 … 5 2 3 3 4 1 3 4 5 3 4 2\n",
	" 5 4 5 1 5 5 4 3 5 3 4 2 5 6 3 5 5 5 6 6 3 5 2 6 3\n",
	" 1 5 1 5 4 5 4 4 5 4 4 3 6 1 4 3 5 2 5 1 6 4 5 6 6\n",
	" 7 3 6 2 4 5 3 6 5 5 3 4 7 3 3 4 3 3 5 3 2 4 6 3 3\n",
	" 5 6 2 2 4 5 4 2 5 3 5 4 2 2 3 3 4 3 6 2 6 4 6 5 4\n",
	" 3 3 6 6 4 6 2 3 3 6 3 3 5 … 3 4 4 4 7 5 3 4 6 3 3 2\n",
	" 4 3 3 4 8 1 6 6 5 3 4 4 5 5 3 5 7 5 5 4 3 3 4 5 4\n",
	" 7 6 7 2 4 4 6 6 4 4 3 4 2 3 5 3 4 5 5 1 3 6 7 3 4\n",
	" 4 5 4 2 3 4 3 5 2 4 1 5 5 3 4 3 2 3 5 2 6 4 7 6 5\n",
	" 4 4 4 3 4 3 5 5 3 7 6 1 4 2 5 4 3 5 3 2 2 6 2 5 4"
	]
	},
	"execution_count": 2,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"# Let's assume our data is in a 2D UInt8 array\n",
	"arrays = rand(UInt8, (narrays, array_bytes));\n",
	"\n",
	"# Perhaps we can count ones of the rows\n",
	"count_ones.(arrays)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 3,
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"10×1 Array{Int64,2}:\n",
	" 509\n",
	" 519\n",
	" 523\n",
	" 492\n",
	" 516\n",
	" 493\n",
	" 528\n",
	" 524\n",
	" 507\n",
	" 519"
	]
	},
	"execution_count": 3,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"# That is running popcount on each UInt8 which probably isn't going to be great\n",
	"# for performance, but let's see how it plays out.\n",
	"\n",
	"# We will make a function out of broadcasting `count_ones` for easier benchmarking\n",
	"popcount_2d_byte_arrays(arrays::Array{UInt8, 2}) = sum(count_ones.(arrays), dims=2)\n",
	"\n",
	"popcount_2d_byte_arrays(arrays)"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"Well that seems to work, let's time it."
	]
	},
	{
	"cell_type": "code",
	"execution_count": 4,
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"benchmark (generic function with 1 method)"
	]
	},
	"execution_count": 4,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"function benchmark(arrays, popcountf, fname, num_bytes)\n",
	" num_runs = 100\n",
	" # Call it once to ensure function is compiled\n",
	" popcountf(arrays)\n",
	" total_time = @elapsed for run in 1:num_runs\n",
	" counts = popcountf(arrays)\n",
	" end\n",
	" avg_run_s = total_time/num_runs\n",
	"\n",
	" println(\"Avg run speed from julia: \", avg_run_s)\n",
	"\n",
	" bandwidth = num_bytes / (avg_run_s * 2^20)\n",
	" printfmt(\"Julia $(fname) popcount measured bandwidth of {:.1f} MiB/s\", bandwidth)\n",
	" return avg_run_s\n",
	"end"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 5,
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"Avg run speed from julia: 0.04621711034\n",
	"Julia 2d byte array popcount measured bandwidth of 264.1 MiB/s"
	]
	}
	],
	"source": [
	"narrays = 100000\n",
	"array_bytes = 128\n",
	"arrays_bytes = rand(UInt8, (narrays, array_bytes))\n",
	"\n",
	"benchmark(arrays_bytes, popcount_2d_byte_arrays, \"2d byte array\", narrays * array_bytes);"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"So we have a starting point - `~270 MiB/s`. Now we saw that we were essentially calling `count_ones` on individual bytes, what would happen if our elements were a bit bigger - say machine word sized? Let's use a `UInt64` instead of a `UInt8`."
	]
	},
	{
	"cell_type": "code",
	"execution_count": 6,
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"Avg run speed from julia: 0.00291228427\n",
	"Julia 2d UInt64 array popcount measured bandwidth of 4191.6 MiB/s"
	]
	}
	],
	"source": [
	"# Same function definition as before - just different input type.\n",
	"popcount_2d_uint64_arrays(arrays::Array{UInt64, 2}) = sum(count_ones.(arrays), dims=2)\n",
	"\n",
	"# input array now has 8 times less elements.\n",
	"arrays_64 = rand(UInt64, (narrays, Int(array_bytes/8)));\n",
	"\n",
	"benchmark(arrays_64, popcount_2d_uint64_arrays, \"2d UInt64 array\", narrays * array_bytes);"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"Well! Now we are talking in `GiB/s`. A good improvement over our first stab too. This is probably because the compiler can take advantage of a popcount instruction that works with a width of 64 bits. We can actually see this by calling `code_native`:"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 7,
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"\t.text\n",
	"; Function count_ones {\n",
	"; Location: int.jl:358\n",
	"\tpopcntq\t%rdi, %rax\n",
	"\tretq\n",
	"\tnopw\t%cs:(%rax,%rax)\n",
	";}\n"
	]
	}
	],
	"source": [
	"# The count_ones wraps the native popcntq instruction.\n",
	"code_native(count_ones, (UInt64,))"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 8,
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"\t.text\n",
	"; Function count_ones {\n",
	"; Location: int.jl:358\n",
	"\tmovzbl\t%dil, %eax\n",
	"\tpopcntl\t%eax, %eax\n",
	"\tretq\n",
	"\tnopl\t(%rax)\n",
	";}\n"
	]
	}
	],
	"source": [
	"# The count_ones wraps the native popcnt instructions\n",
	"code_native(count_ones, (UInt8,))"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 9,
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"\t.text\n",
	"; Function count_ones {\n",
	"; Location: int.jl:358\n",
	"\tpopcntq\t%rsi, %rcx\n",
	"\tpopcntq\t%rdi, %rax\n",
	"\taddq\t%rcx, %rax\n",
	"\tretq\n",
	"\tnop\n",
	";}\n"
	]
	}
	],
	"source": [
	"# The count_ones wraps the native popcnt instructions\n",
	"code_native(count_ones, (UInt128,))"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 10,
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"Avg run speed from julia: 0.00171447827\n",
	"Julia 2d UInt128 array popcount measured bandwidth of 7120.0 MiB/s"
	]
	}
	],
	"source": [
	"# Well the UInt128 instructions are just calling the popcntq twice and adding the result (in assembly)\n",
	"# We should see a speed up by using 128bit integers.\n",
	"# Same function definition as before - just different input type.\n",
	"popcount_2d_uint128_arrays(arrays::Array{UInt128, 2}) = sum(count_ones.(arrays), dims=2)\n",
	"\n",
	"# input array now has 16 times less elements.\n",
	"arrays_128 = rand(UInt128, (narrays, Int(array_bytes/16)));\n",
	"\n",
	"benchmark(arrays_128, popcount_2d_uint128_arrays, \"2d UInt128 array\", narrays * array_bytes);"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 11,
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"Avg run speed from julia: 0.01657183077\n",
	"Julia ntuple of uint64s popcount measured bandwidth of 7366.1 MiB/s"
	]
	}
	],
	"source": [
	"# Note 128 Bytes is 1024 bits. Which is 16 x 64bits\n",
	"# Lets assume that working on 4 x 64bits at a time is best able to be vectorized.\n",
	"fourwords = NTuple{4, UInt64}\n",
	"\n",
	"function fword_popcount(arrays)\n",
	" counts = zeros(Int32, size(arrays)[1])\n",
	" for i in eachindex(arrays)\n",
	" pcount = 0\n",
	" for j in 1:4\n",
	" pcount += sum(count_ones.(arrays[i][j]))\n",
	" end\n",
	" counts[i] = pcount\n",
	" end\n",
	" return counts\n",
	"end\n",
	"\n",
	"make_fword() = [fourwords(rand(UInt64, 4)) for i in 1:4]\n",
	"narrays = 1000000\n",
	"fdata = [make_fword() for i in 1:narrays]\n",
	"\n",
	"benchmark(fdata, fword_popcount, \"ntuple of uint64s\", narrays * array_bytes);"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"Some increased preformance for our efforts.\n",
	"\n",
	"Now a more natural datatype for the user might be an array of `BitArray` instances."
	]
	},
	{
	"cell_type": "code",
	"execution_count": 12,
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"((1000000,), (1024,))"
	]
	},
	"execution_count": 12,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"bitarrays = [BitArray(rand(Bool, array_bytes * 8)) for i in 1:narrays];\n",
	"size(bitarrays), size(bitarrays[1])"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 13,
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"534"
	]
	},
	"execution_count": 13,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"# Pretty cool that this works\n",
	"count(bitarrays[1])"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 14,
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"Avg run speed from julia: 0.01412127291\n",
	"Julia broadcast count on BitArray popcount measured bandwidth of 8644.4 MiB/s"
	]
	}
	],
	"source": [
	"# Just broadcasting the builtin `count` on the bitarray\n",
	"popcount_broadcast_count(bitarrays) = count.(bitarrays)\n",
	"\n",
	"benchmark(bitarrays, popcount_broadcast_count, \"broadcast count on BitArray\", narrays * array_bytes);"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"`~9 GiB/s` using the builtin `count`. Well any half way sensible person would be happy with that."
	]
	},
	{
	"cell_type": "code",
	"execution_count": 15,
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"16-element Array{UInt64,1}:\n",
	" 0xff35c936649e4b09\n",
	" 0x99e0ef02ff3d8dbe\n",
	" 0x8e47077312897feb\n",
	" 0x4f5e80ccbb130238\n",
	" 0xc5e0cffe2acd7174\n",
	" 0x5abff90455268611\n",
	" 0x2f6645cd50f4d9c0\n",
	" 0x57eaaf7e6658b4e0\n",
	" 0x2abb4fe469c1dc8a\n",
	" 0xffa2383b0676a71e\n",
	" 0xbea07f0615cdd6c9\n",
	" 0x16fe9c48ede9b493\n",
	" 0xa671a5c4e61d3387\n",
	" 0xe0fd7542fae48ba1\n",
	" 0xc077d70393b29792\n",
	" 0x69bdc92b265316d2"
	]
	},
	"execution_count": 15,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"# Observe that BitArray instances have a `.chunks` attribute\n",
	"# with raw UInt64 elements that we can use to efficiently popcount:\n",
	"bitarrays[1].chunks"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 16,
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"16-element Array{Int64,1}:\n",
	" 34\n",
	" 38\n",
	" 34\n",
	" 28\n",
	" 36\n",
	" 30\n",
	" 31\n",
	" 36\n",
	" 33\n",
	" 35\n",
	" 34\n",
	" 35\n",
	" 32\n",
	" 34\n",
	" 32\n",
	" 32"
	]
	},
	"execution_count": 16,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"# So we can broadcast the `count_ones` function across these chunks\n",
	"count_ones.(bitarrays[1].chunks)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 17,
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"534"
	]
	},
	"execution_count": 17,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"# And sum them up\n",
	"sum(count_ones.(bitarrays[1].chunks))"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 23,
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"Avg run speed from julia: 0.06992416397000001\n",
	"Julia BitArray count_ones of chunks popcount measured bandwidth of 1745.8 MiB/s"
	]
	}
	],
	"source": [
	"popcount_ba_chunks(chunks::Array{UInt64}) = sum(count_ones.(chunks))\n",
	" \n",
	"function popcount_sum_count_ones(bitarrays)\n",
	" l = size(bitarrays)[1]\n",
	" counts = zeros(Int64, l)\n",
	" for i in 1:l\n",
	" counts[i] = popcount_ba_chunks(bitarrays[i].chunks)\n",
	" end\n",
	" return counts\n",
	"end\n",
	"\n",
	"benchmark(bitarrays, popcount_sum_count_ones, \"BitArray count_ones of chunks\", narrays * array_bytes);"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"Ok so clearly Julia knows a better way to count ones than that. Let's try something different and wrap some C++ code and call that from Julia... The original code can be found on [github](https://github.com/n1analytics/anonlink/blob/master/_cffi_build/dice_one_against_many.cpp#L49).\n",
	"\n",
	"I'm trying to make the benchmarking fair in that we will still preallocate the output, noting that particular speed up would help all of the methods."
	]
	},
	{
	"cell_type": "code",
	"execution_count": 19,
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"popcount (generic function with 2 methods)"
	]
	},
	"execution_count": 19,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"popcount(arrays::Array{UInt8}, array_bytes::Int64, narrays::Int64) = popcount(zeros(Int32, narrays), arrays, array_bytes, narrays)\n",
	"function popcount(counts::Array{Int32}, arrays::Array{UInt8}, array_bytes::Int64, narrays::Int64)\n",
	" elapsed_ms = ccall(\n",
	" (:popcount_arrays, \"_countingbits.so\"), Cdouble,\n",
	" (Ptr{Int32},Ptr{UInt8},Cint,Cint),\n",
	" counts, arrays, narrays, array_bytes\n",
	" )\n",
	" return counts\n",
	"end"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 20,
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"Avg run speed from julia: 0.00864360435\n",
	"Julia C++ from anonlink popcount measured bandwidth of 14122.6 MiB/s"
	]
	}
	],
	"source": [
	"arrays = rand(UInt8, narrays * array_bytes)\n",
	"f(arrays) = popcount(arrays, array_bytes, narrays)\n",
	"\n",
	"benchmark(arrays, f, \"C++ from anonlink\", narrays * array_bytes);"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"You have no idea how worried I was that it wouldn't be faster! Still it isn't faster by much. Can you beat it?"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 21,
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"10-element Array{Tuple{Int64,Int64},1}:\n",
	" (504, 504)\n",
	" (497, 497)\n",
	" (537, 537)\n",
	" (496, 496)\n",
	" (492, 492)\n",
	" (520, 520)\n",
	" (505, 505)\n",
	" (504, 504)\n",
	" (492, 492)\n",
	" (507, 507)"
	]
	},
	"execution_count": 21,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"# To check it is correct we convert the memory layout and compare:\n",
	"reshaped_2d = reshape(arrays, (narrays, array_bytes))\n",
	"\n",
	"function convert_memory(x::Array{T, 2}) where T\n",
	" ninds = axes(x, 1)\n",
	" binds = axes(x, 2)\n",
	" \n",
	" out = similar(Array{T}, binds, ninds)\n",
	" for col = ninds, row = binds\n",
	" out[row, col] = x[col, row]\n",
	" end\n",
	" return out\n",
	"end\n",
	"\n",
	"a = [Int64(i) for i in popcount(convert_memory(reshaped_2d), array_bytes, narrays)]\n",
	"b = popcount_2d_byte_arrays(reshaped_2d)\n",
	"\n",
	"collect(zip(a, b))[1:10]"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 22,
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"true"
	]
	},
	"execution_count": 22,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"count(a .== b) == length(a)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": []
	}
	],
	"metadata": {
	"kernelspec": {
	"display_name": "Julia 1.0.3",
	"language": "julia",
	"name": "julia-1.0"
	},
	"language_info": {
	"file_extension": ".jl",
	"mimetype": "application/julia",
	"name": "julia",
	"version": "1.0.3"
	}
	},
	"nbformat": 4,
	"nbformat_minor": 2
	}