Skip to content

Instantly share code, notes, and snippets.

@zed
Created April 2, 2010 10:20
Show Gist options
  • Star 3 You must be signed in to star a gist
  • Fork 2 You must be signed in to fork a gist
  • Save zed/353005 to your computer and use it in GitHub Desktop.
Save zed/353005 to your computer and use it in GitHub Desktop.
xor string: numpy vs. pyublas vs. fortran vs. C vs. Cython vs. Boost.Python
boost-python/bin/
*.py[co]
*.so
*.o
/xorcy.c
# Copyright David Abrahams 2006. Distributed under the Boost
# Software License, Version 1.0. (See accompanying
# file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
# Edit this path to point at the tools/build/v2 subdirectory of your
# Boost installation. Absolute paths work, too.
boost-build /usr/share/doc/libboost1.40-doc/examples/tools/build/v2 ;
#!/usr/bin/env python
"""
Usage:
%prog modulename.f90.template [outfilename] [--int]
"""
from __future__ import print_function
import os, string, sys
def main():
only_ints = '--int' in sys.argv
if only_ints:
sys.argv.remove('--int')
filename = sys.argv[1]
if len(sys.argv) > 2:
outfilename = sys.argv[2]
else:
outfilename = os.path.splitext(os.path.basename(filename))[0]
with open(filename) as fin:
substitute = string.Template(fin.read()).substitute
types = [('%s*%d' % (type_,i), '%s%d' % (type_sig, i*8))
for i in (1, 2, 4, 8)
for type_, type_sig in [('real', 'float'),
('integer', 'int'),
('unsigned', 'uint')][only_ints:2]
if not (type_ == 'real' and i in (1, 2, 16))]
with open(outfilename, 'w') as fout:
for type_, type_sig in types:
print(substitute(dict(type=type_, type_sig=type_sig)), file=fout)
if __name__=="__main__":
main()
# Copyright David Abrahams 2006. Distributed under the Boost
# Software License, Version 1.0. (See accompanying
# file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
using python ;
# Specify that the boost-python library exists under the name
# boost_python. That is, because the library was installed at the
# standard search path as /usr/lib/libboost_python.so, bjam will find
# it automatically. No need to specify the absolute path.
lib libboost_python : : <name>boost_python ;
# boost_python library.
project : requirements
<library>libboost_python
<include>/usr/local/lib/python2.6/dist-packages/PyUblas-0.93.1-py2.6-linux-x86_64.egg/include ;
# Make the definition of the python-extension rule available
import python ;
# Declare a Python extension
python-extension xorcpp : xor.cpp ;
import testing ;
# Declare a test of the extension module
testing.make-test run-pyd : xorcpp test_xorcpp.py : : test_ext ;
# Create a "test" target that runs all the tests
alias test : test_ext ;
# make sure the tests don't run by default
explicit test_ext test ;
NAME=xorf
measure_performance: xorcy.so $(NAME).so xorcpp.so
# test that all function produce the same result
python xor_main.py
# measure performance
python -mtimeit -s'from xor_main import aa, bb, slow_xor' 'slow_xor(aa, bb)'
-python -mtimeit -s'from xor_main import aa, bb; from xorcpp import xorcpp' 'xorcpp(aa, bb)'
-python -mtimeit -s'from xor_main import aa, bb; from xorcpp import xorcpp_inplace' 'xorcpp_inplace(aa, bb)'
-python -mtimeit -s'from xor_main import aa, bb; from xor_strings import xorcpp' 'xorcpp(aa, bb, "int16")'
-python -mtimeit -s'from xor_main import aa, bb; from xor_strings import xorcpp' 'xorcpp(aa, bb, "int32")'
-python -mtimeit -s'from xor_main import aa, bb; from xor_strings import xorcpp' 'xorcpp(aa, bb, "int64")'
-python -mtimeit -s'from xor_main import aa, bb; from xor_strings import xorcpp' 'xorcpp(aa, bb, "uint32")'
-python -mtimeit -s'from xor_main import aa, bb; from xor_strings import xorcpp' 'xorcpp(aa, bb, "uint64")'
python -mtimeit -s'from xor_main import aa, bb, xor_strings' 'xor_strings(aa, bb, "int16")'
python -mtimeit -s'from xor_main import aa, bb, xor_strings' 'xor_strings(aa, bb, "int32")'
python -mtimeit -s'from xor_main import aa, bb, xor_strings' 'xor_strings(aa, bb, "int64")'
python -mtimeit -s'from xor_main import aa, bb; from marek import faster_slow_xor' 'faster_slow_xor(aa,bb)'
python -mtimeit -s'from xor_main import aa, bb; from marek import inline_xor' 'inline_xor(aa,bb)'
python -mtimeit -s'from xor_main import aa, bb, xor' 'xor(aa, bb)'
-python -mtimeit -s'from xor_main import aa, bb; from xor_strings import xorcpp_inplace' 'xorcpp_inplace(aa, bb, "int8")'
-python -mtimeit -s'from xor_main import aa, bb; from xor_strings import xorcpp_inplace' 'xorcpp_inplace(aa, bb, "int16")'
-python -mtimeit -s'from xor_main import aa, bb; from xor_strings import xorcpp_inplace' 'xorcpp_inplace(aa, bb, "int32")'
-python -mtimeit -s'from xor_main import aa, bb; from xor_strings import xorcpp_inplace' 'xorcpp_inplace(aa, bb, "int64")'
-python -mtimeit -s'from xor_main import aa, bb; from xor_strings import xorcpp_inplace' 'xorcpp_inplace(aa, bb, "uint8")'
-python -mtimeit -s'from xor_main import aa, bb; from xor_strings import xorcpp_inplace' 'xorcpp_inplace(aa, bb, "uint16")'
-python -mtimeit -s'from xor_main import aa, bb; from xor_strings import xorcpp_inplace' 'xorcpp_inplace(aa, bb, "uint32")'
-python -mtimeit -s'from xor_main import aa, bb; from xor_strings import xorcpp_inplace' 'xorcpp_inplace(aa, bb, "uint64")'
python -mtimeit -s'from xor_main import aa, bb, xor_vectorised' 'xor_vectorised(aa, bb)'
python -mtimeit -s'from xor_main import aa, bb, inline_xor_nocopy' 'inline_xor_nocopy(aa, bb)'
# | function | time, usec | ratio | type |
# |-------------------------+------------+-------+--------------|
# | slow_xor | 2020 | 1.0 | numpy |
# | xorcpp (int64) | 2010 | 1.0 | pyublas |
# | xorcpp (uint64) | 1980 | 1.0 | pyublas |
# | xorf_int16 | 1570 | 1.3 | fortran |
# | xorf_int32 | 1530 | 1.3 | fortran |
# | xorf_int64 | 1420 | 1.4 | fortran |
# | faster_slow_xor | 1360 | 1.5 | numpy |
# | inline_xor | 1280 | 1.6 | C |
# | cython_xor | 1290 | 1.6 | cython |
# | xorcpp (int16) | 1060 | 1.9 | pyublas |
# | xorcpp (int8) | 1040 | 1.9 | pyublas |
# | xorcpp (int32) | 1000 | 2.0 | pyublas |
# | xorcpp_byte | 990 | 2.0 | pyublas |
# | xorcpp (uint32) | 985 | 2.1 | pyublas |
# | xorcpp_inplace (uint32) | 442 | 4.6 | pyublas |
# | xorcpp_inplace (int32) | 440 | 4.6 | pyublas |
# | cython_xor_vectorised | 325 | 6.2 | cython |
# | inline_xor_nocopy | 172 | 11.7 | C |
# | xorcpp | 144 | 14.0 | boost.python |
# | xorcpp_inplace | 122 | 16.6 | boost.python |
# #+TBLFM: $3=@2$2/$2;%.1f
xorcpp: xorcpp.so
python -mtimeit -s'from xor_main import aa, bb; from xorcpp import xorcpp' 'xorcpp(aa, bb)'
python -mtimeit -s'from xor_main import aa, bb; from xorcpp import xorcpp_inplace' 'xorcpp_inplace(aa, bb)'
xorcpp_inplace: xorcpp.so
python -mtimeit -s'from xor_main import aa, bb; from xor_strings import xorcpp_inplace' 'xorcpp_inplace(aa, bb, "int32")'
xorcpp.so: boost-python/xor.cpp
-cd boost-python && bjam release
#NOTE: hard-coded path with version
-cp boost-python/bin/gcc-4.4.1/release/* ./
xorcy.so: xorcy.pyx setup.py
python setup.py build
#NOTE: hard-coded path
cp build/lib.linux-x86_64-2.6/xorcy.so ./
install_xorcy: xorcy.pyx setup.py
python setup.py install
%.so: %.pyf %.f90
f2py --no-wrap-functions -c $^
%.pyf: %.f90
f2py --overwrite-signature -h $@ -m $* $<
# $@ - current target
# $* '%'-part (works if there *is* '%' in specification)
# $< first dependence
# $^ all dependencies (without duplicates)
%.f90: %.f90.template generate_fortran.py
python generate_fortran.py --int $<
clean:
-rm *.so
"""See
http://stackoverflow.com/questions/2119761/simple-python-challenge-fastest-bitwise-xor-on-data-buffers/2179687#2179687
"""
import numpy
from scipy import weave
def faster_slow_xor(aa,bb):
b = numpy.fromstring(bb, dtype=numpy.uint64)
numpy.bitwise_xor(numpy.frombuffer(aa,dtype=numpy.uint64), b, b)
return b.tostring()
code = """
const __m128i* pa = (__m128i*)a;
const __m128i* pend = (__m128i*)(a + arr_size);
__m128i* pb = (__m128i*)b;
__m128i xmm1, xmm2;
while (pa < pend) {
xmm1 = _mm_loadu_si128(pa); // must use unaligned access
xmm2 = _mm_load_si128(pb); // numpy will align at 16 byte boundaries
_mm_store_si128(pb, _mm_xor_si128(xmm1, xmm2));
++pa;
++pb;
}
"""
def inline_xor(aa, bb):
a = numpy.frombuffer(aa, dtype=numpy.uint64)
b = numpy.fromstring(bb, dtype=numpy.uint64)
arr_size = a.shape[0]
weave.inline(code, ["a", "b", "arr_size"], headers = ['"emmintrin.h"'])
return b.tostring()
support = """
#define ALIGNMENT 16
static void memxor(const char* in1, const char* in2, char* out, ssize_t n) {
const char* end = in1 + n;
while (in1 < end) {
*out = *in1 ^ *in2;
++in1;
++in2;
++out;
}
}
"""
code2 = """
PyObject* res = PyString_FromStringAndSize(NULL, real_size);
const ssize_t tail = (ssize_t)PyString_AS_STRING(res) % ALIGNMENT;
const ssize_t head = (ALIGNMENT - tail) % ALIGNMENT;
memxor((const char*)a, (const char*)b, PyString_AS_STRING(res), head);
const __m128i* pa = (__m128i*)((char*)a + head);
const __m128i* pend = (__m128i*)((char*)a + real_size - tail);
const __m128i* pb = (__m128i*)((char*)b + head);
__m128i xmm1, xmm2;
__m128i* pc = (__m128i*)(PyString_AS_STRING(res) + head);
while (pa < pend) {
xmm1 = _mm_loadu_si128(pa);
xmm2 = _mm_loadu_si128(pb);
_mm_stream_si128(pc, _mm_xor_si128(xmm1, xmm2));
++pa;
++pb;
++pc;
}
memxor((const char*)pa, (const char*)pb, (char*)pc, tail);
return_val = res;
Py_DECREF(res);
"""
def inline_xor_nocopy(aa, bb):
real_size = len(aa)
a = numpy.frombuffer(aa, dtype=numpy.uint64)
b = numpy.frombuffer(bb, dtype=numpy.uint64)
return weave.inline(code2, ["a", "b", "real_size"],
headers = ['"emmintrin.h"'],
support_code = support)
from distutils.core import setup
from distutils.extension import Extension
from Cython.Distutils import build_ext
setup(
name = "xorcy",
ext_modules=[Extension("xorcy", ["xorcy.pyx"])],
cmdclass = {'build_ext': build_ext}
)
# Copyright Ralf W. Grosse-Kunstleve 2006. Distributed under the Boost
# Software License, Version 1.0. (See accompanying
# file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
# Using the doctest module here to ensure that the results are as expected.
r'''>>> from xorcpp import *
>>> a, b = 'ab', 'bac'
>>> xorcpp(b, a)
Traceback (most recent call last):
...
ValueError: check len(a) <= len(b) failed
>>> c = xorcpp(a, b)
>>> c, b, a
('\x03\x03', 'bac', 'ab')
>>> a == 'ab' and b == 'bac'
True
>>> xorcpp_inplace(a, b)
'\x03\x03c'
>>> b == '\x03\x03c' and c == b[:len(a)]
True
>>> a == 'ab'
True
>>> xorcpp('abc', 'def')
'\x05\x07\x05'
>>> xorcpp('abc', 'bac')
'\x03\x03\x00'
>>> xorcpp('cab', 'cba')
'\x00\x03\x03'
>>> import numpy as np
>>> import pyublas # import converters
>>> a = np.array(map(ord, 'abc'))
>>> b = np.array(map(ord, 'def'))
>>> a_copy = a.copy()
>>> b_copy = b.copy()
>>> d = xorcpp(a, b)
>>> d
array([5, 7, 5])
>>> all(a == a_copy) and all(b == b_copy)
True
>>> xorcpp_inplace(a, b)
array([5, 7, 5])
>>> all(a == a_copy)
True
>>> all(b == b_copy)
False
>>> all(b == d)
True
>>> N = 100
>>> all('\x03'*n == xorcpp('a'*n, 'b'*(n+3)) for n in range(N))
True
>>> m = 19
>>> all(('\x03'*n + 'b'*m) == xorcpp_inplace('a'*n, 'b'*(n+m)) for n in range(N))
True
'''
def run(args = None):
if args is not None:
import sys
sys.argv = args
import doctest, test_xorcpp
return doctest.testmod(test_xorcpp, verbose=True)
if __name__ == '__main__':
import sys
sys.exit(run()[0])
#include <inttypes.h>
#include <algorithm>
#include <cassert>
#include <iterator>
#include <boost/lambda/lambda.hpp>
#include <boost/python.hpp>
#include <pyublas/numpy.hpp>
namespace {
namespace py = boost::python;
template<class InputIterator, class InputIterator2, class OutputIterator>
OutputIterator
xor_(InputIterator first, InputIterator last,
InputIterator2 first2, OutputIterator result) {
// `result` might be `first` but not any of the input iterators
namespace ll = boost::lambda;
typedef typename std::iterator_traits<OutputIterator>::value_type Int;
return std::transform(first, last, first2, result,
ll::ret<Int>(ll::_1 ^ ll::_2));
}
template<class T>
void
xor_aligned(const char* ai, size_t n, const char* bi, char* ci) {
const size_t alignment = std::max(sizeof(T), 16ul);
if (n < 2*alignment) {
xor_(ai, ai + n, bi, ci);
}
else {
assert(n >= 2*alignment);
// applying Marek's algorithm to align
const ptrdiff_t head = ((alignment -
(reinterpret_cast<uintptr_t>(ci) % alignment)) %
alignment);
const ptrdiff_t tail = (n - (reinterpret_cast<uintptr_t>(ci + n) %
alignment));
xor_(ai, ai + head, bi, ci);
xor_(reinterpret_cast<const T*>(ai + head),
reinterpret_cast<const T*>(ai + tail),
reinterpret_cast<const T*>(bi + head),
reinterpret_cast<T*>(ci + head));
xor_(ai + tail, ai + n, bi + tail, ci + tail);
}
}
void check_len(size_t a, size_t b) {
if (a <= b) return;
PyErr_SetString(PyExc_ValueError, "check len(a) <= len(b) failed");
throw py::error_already_set();
}
template<class T>
PyObject*
xorcpp_str(const py::str& a, const py::str& b) {
const size_t n = py::len(a);
check_len(n, py::len(b));
PyObject* res = PyString_FromStringAndSize(NULL, n);
if (!res) throw py::error_already_set();
xor_aligned<T>(py::extract<const char*>(a), n,
py::extract<const char*>(b), PyString_AS_STRING(res));
return res; // return *New reference*
}
template<class T>
py::str
xorcpp_str_inplace(const py::str& a, py::str& b) {
//NOTE: this function modifies internal buffer of `b`
const size_t n = py::len(a);
check_len(n, py::len(b));
char* bi = py::extract<char*>(b);
xor_aligned<T>(bi, n, py::extract<const char*>(a), bi);
return b;
}
template<class Int>
pyublas::numpy_vector<Int>
xorcpp_pyublas(const pyublas::numpy_vector<Int>& a,
const pyublas::numpy_vector<Int>& b) {
check_len(a.size(), b.size());
pyublas::numpy_vector<Int> s(a.size());
xor_(a.begin(), a.end(), b.begin(), s.begin());
return s;
}
template<class Int>
pyublas::numpy_vector<Int>
xorcpp_pyublas_inplace(const pyublas::numpy_vector<Int>& a,
pyublas::numpy_vector<Int> b) {
check_len(a.size(), b.size());
xor_(b.begin(), b.begin() + a.size(), a.begin(), b.begin());
return b;
}
}
BOOST_PYTHON_MODULE(xorcpp)
{
namespace py = boost::python;
typedef int_fast32_t int_fast_xor_t;
py::def("xorcpp", xorcpp_pyublas<uint8_t>);
py::def("xorcpp", xorcpp_pyublas<uint16_t>);
py::def("xorcpp", xorcpp_pyublas<uint32_t>);
py::def("xorcpp", xorcpp_pyublas<uint64_t>);
py::def("xorcpp", xorcpp_pyublas<int8_t>);
py::def("xorcpp", xorcpp_pyublas<int16_t>);
py::def("xorcpp", xorcpp_pyublas<int32_t>);
py::def("xorcpp", xorcpp_pyublas<int64_t>);
py::def("xorcpp", xorcpp_str<int_fast_xor_t>);
py::def("xorcpp_inplace", xorcpp_pyublas_inplace<uint8_t>);
py::def("xorcpp_inplace", xorcpp_pyublas_inplace<uint16_t>);
py::def("xorcpp_inplace", xorcpp_pyublas_inplace<uint32_t>);
py::def("xorcpp_inplace", xorcpp_pyublas_inplace<uint64_t>);
py::def("xorcpp_inplace", xorcpp_pyublas_inplace<int8_t>);
py::def("xorcpp_inplace", xorcpp_pyublas_inplace<int16_t>);
py::def("xorcpp_inplace", xorcpp_pyublas_inplace<int32_t>);
py::def("xorcpp_inplace", xorcpp_pyublas_inplace<int64_t>);
py::def("xorcpp_inplace", xorcpp_str_inplace<int_fast_xor_t>);
}
"""See
http://stackoverflow.com/questions/2119761/simple-python-challenge-fastest-bitwise-xor-on-data-buffers
"""
import os
import numpy as np
#from profilestats import profile
from xor_strings import xor_strings, xorcpp, xorcpp_byte, xorcpp_inplace
from marek import inline_xor_nocopy, inline_xor
from xorcy import xor, xor_vectorised
from xorcpp import xorcpp as xorcpp_str, xorcpp_inplace as xorcpp_str_inplace
def slow_xor(aa, bb):
a=np.frombuffer(aa,dtype=np.byte)
b=np.frombuffer(bb,dtype=np.byte)
c=np.bitwise_xor(a,b)
r=c.tostring()
return r
aa = os.urandom(2**20)
bb = os.urandom(2**20)
#@profile
def test():
global aa, bb
a_list = list(aa)
b_list = list(bb)
c = xor_strings(aa, bb, 'int64') # control
c_list = list(c)
def xorcpp_type(type_):
def xorcpp_t(aa, bb):
return xorcpp(aa,bb, type_)
xorcpp_t.__name__='xorcpp_'+type_
return xorcpp_t
def xorcpp_type_inplace(type_):
def xorcpp_inplace_t(aa, bb):
return xorcpp_inplace(aa, bb, type_)
return xorcpp_inplace_t
for func in [xorcpp_str, xorcpp_type('int8'), xorcpp_type('uint64'), xorcpp_type('int64'),
xorcpp_byte, slow_xor, inline_xor, inline_xor_nocopy, xor, xor_vectorised,
xorcpp_type_inplace('int64'),
]:
aa, bb = ''.join(a_list), ''.join(b_list)
assert func(aa, bb) == c, (func.__name__, aa[:5], bb[:5], c[:5])
assert aa == ''.join(a_list) and bb == ''.join(b_list) and c == ''.join(c_list)
cc = xorcpp_str_inplace(aa, bb)
assert aa == ''.join(a_list)
assert bb == ''.join(c_list)
assert cc == ''.join(c_list)
aa, bb = ''.join(a_list), ''.join(b_list)
cc = xorcpp_type_inplace('int8')(aa, bb)
assert aa == ''.join(a_list)
assert bb == ''.join(c_list)
assert cc == ''.join(c_list)
if __name__=="__main__":
test()
import xorf
import xorcpp as xc
import numpy as np
import pyublas
def xor_strings(a, b, type_sig):
a = np.frombuffer(a, dtype=np.dtype(type_sig))
b = np.frombuffer(b, dtype=np.dtype(type_sig))
return getattr(xorf, 'xor_'+type_sig)(a, b).tostring()
def xorcpp_byte(a, b):
a = np.frombuffer(a,dtype=np.byte)
b = np.frombuffer(b,dtype=np.byte)
return xc.xorcpp(a, b).tostring()
def xorcpp(a, b, type_sig):
a = np.frombuffer(a, dtype=np.dtype(type_sig))
b = np.frombuffer(b, dtype=np.dtype(type_sig))
if not a.flags.aligned:
a = a.copy() # align
if not b.flags.aligned:
b = b.copy()
return xc.xorcpp(a, b).tostring()
def xorcpp_inplace(a, b, type_sig):
a = np.frombuffer(a, dtype=np.dtype(type_sig))
b = np.frombuffer(b, dtype=np.dtype(type_sig))
if not a.flags.aligned:
a = a.copy() # align
if not b.flags.aligned:
b = b.copy()
c = xc.xorcpp_inplace(a, b)
return c.tostring()
# http://stackoverflow.com/questions/2119761/simple-python-challenge-fastest-bitwise-xor-on-data-buffers/2174000#2174000
cdef char c[1048576]
def xor(char *a,char *b):
cdef int i
for i in range(1048576):
c[i]=a[i]^b[i]
return c[:1048576]
def xor_vectorised(char *a,char *b):
cdef int i
for i in range(131094):
(<unsigned long long *>c)[i]=(<unsigned long long *>a)[i]^(<unsigned long long *>b)[i]
return c[:1048576]
! xorf.f90.template
subroutine xor_$type_sig(a, b, n, out)
implicit none
integer, intent(in) :: n
$type, intent(in), dimension(n) :: a
$type, intent(in), dimension(n) :: b
$type, intent(out), dimension(n) :: out
out = ieor(a, b)
end subroutine xor_$type_sig
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment