
Python group by array a, and summarize array b - Performance
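The task: given two equally long arrays, group the elements of b by the values in a and sum each group. A minimal pure-NumPy illustration of the expected result (same toy data that test_f.py checks against; not one of the timed variants, just to fix the expected output):

import numpy as np

a = np.array([7, 3, 5, 7, 5, 7])
b = np.array([0.2, 0.1, 0.3, 0.1, 0.1, 0.2])
keys, inv = np.unique(a, return_inverse=True)   # unique keys and a group index per element
sums = np.bincount(inv, weights=b)              # sum b within each group
print(keys, sums)                               # [3 5 7] [0.1 0.4 0.5]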

.gitignore
/build/
/pdf.c
/pdf.so
/pdf_hash.so
/pdf_hash.cpp
Makefile
.PHONY: all test clean measure_pyx measure_bincount measure_slow measure_so measure_hash

NAME = pdf

all: test measure_pyx measure_bincount measure_hash

test: $(NAME).so test_f.py
	python test_f.py

measure_pyx: $(NAME).so test_f.py
	python -mtimeit -s'from test_f import a,b;' 'x,y=a.copy(),b.copy();'
	python -mtimeit -s'from test_f import a,b; from pdf import f' 'x,y=a.copy(),b.copy(); f(x,y);'

measure_hash: pdf_hash.so test_f.py
	python -mtimeit -s'from test_f import a,b; from pdf_hash import f' 'x,y=a.copy(),b.copy(); f(x,y);'

measure_bincount: test_f.py
	python -mtimeit -s'from test_f import a,b, f_bincount as f' 'f(a,b)'

measure_slow: test_f.py
	python -mtimeit -s'from test_f import a,b, f_slow as f' 'f(a,b)'

measure_so:
	python -mtimeit -s'from test_f import a,b, approach_Pablo as f' 'f(a,b)'
	python -mtimeit -s'from test_f import a,b, approach_2 as f' 'f(a,b)'
	python -mtimeit -s'from test_f import a,b, approach_1 as f' 'f(a,b)'

$(NAME).so pdf_hash.so: $(NAME).pyx pdf_hash.pyx setup.py
	python setup.py build_ext -i

clean:
	git clean -x -d -f
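For reference, the measurement made by the measure_pyx target can also be reproduced from a Python prompt with the timeit module. This is just a rough equivalent of what `python -mtimeit` does, not an extra benchmark from the gist:

import timeit

t = timeit.Timer("x, y = a.copy(), b.copy(); f(x, y)",
                 setup="from test_f import a, b; from pdf import f")
per_loop_sec = min(t.repeat(repeat=3, number=1000)) / 1000
print("%.3g msec per loop" % (per_loop_sec * 1e3))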
measurements.org
Org

np.random.uniform(1, 10, size=10000)

| function       | time, msec | speedup vs f_slow | source |
|----------------+------------+-------------------+--------|
| copy           |     0.0138 |            1884.1 | numpy  |
| f_pyx          |       1.02 |              25.5 | [2]    |
| f_bincount     |       1.87 |              13.9 | [2]    |
| f_hash         |       2.04 |              12.7 | [2]    |
| approach_Pablo |       18.1 |               1.4 | [1]    |
| f_slow         |         26 |               1.0 | [2]    |
| approach_2     |         63 |               0.4 | [1]    |
| approach_1     |        915 |               0.0 | [1]    |

np.random.randint(1, 10, 10000)

| function       | time, msec | speedup vs f_slow | source |
|----------------+------------+-------------------+--------|
| copy           |     0.0135 |             829.6 | numpy  |
| f_hash         |      0.216 |              51.9 | [2]    |
| unique_Unutbu  |      0.372 |              30.1 | [1]    |
| f_pyx          |      0.533 |              21.0 | [2]    |
| f_bincount     |       1.28 |               8.8 | [2]    |
| approach_1     |       4.60 |               2.4 | [1]    |
| approach_Pablo |       6.83 |               1.6 | [1]    |
| f_slow         |       11.2 |               1.0 | [2]    |
| approach_2     |       49.8 |               0.2 | [1]    |

[1] the question

[2] this answer

pdf.pyx
Cython
import numpy as np
cimport numpy as np

ctypedef np.int_t dtype1_t
ctypedef np.float_t dtype2_t

def f(np.ndarray[dtype1_t,ndim=1] a, np.ndarray[dtype2_t,ndim=1] b):
    assert a.shape[0] and a.shape[0] == b.shape[0]
    cdef:
        np.ndarray[Py_ssize_t,ndim=1] ndx = np.argsort(a)
        Py_ssize_t i, pos = -1
        dtype1_t val = -1            # choose val not in a
    a[:] = a[ndx]; b[:] = b[ndx]     # sort both arrays inplace
    for i in range(a.shape[0]):
        if a[i] != val:
            pos += 1
            a[pos] = val = a[i]      # save unique value
            b[pos] = b[i]            # start the sum for this value
        else:
            b[pos] += b[i]           # accumulate into the current group
    return a[:pos+1], b[:pos+1]
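A quick usage sketch for the compiled module (built in place via setup.py below). The arrays are the same ones test_f.py checks; note that f() sorts and overwrites its arguments, hence the copies:

import numpy as np
from pdf import f                      # requires `python setup.py build_ext -i` first

x = np.array([7, 3, 5, 7, 5, 7])       # integer keys, matching np.int_t
y = np.array([0.2, 0.1, 0.3, 0.1, 0.1, 0.2])
keys, sums = f(x.copy(), y.copy())     # copies, because f() modifies its inputs in place
print(keys, sums)                      # [3 5 7] [0.1 0.4 0.5]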
pdf_hash.pyx
Cython
# dereference and increment operators
from cython.operator cimport dereference as deref, preincrement as inc
from unordered_map cimport unordered_map

cimport numpy as np

ctypedef np.int_t dtype1_t
ctypedef np.float_t dtype2_t

def f(np.ndarray[dtype1_t,ndim=1] a, np.ndarray[dtype2_t,ndim=1] b):
    assert a.shape[0] and a.shape[0] == b.shape[0]
    # sum b for each distinct value in a
    cdef unordered_map[dtype1_t,dtype2_t] hashtable
    cdef Py_ssize_t i
    for i in range(a.shape[0]):
        hashtable[a[i]] += b[i]
    # copy the (key, sum) pairs back into the input arrays
    cdef unordered_map[dtype1_t,dtype2_t].iterator it = hashtable.begin()
    i = 0
    while it != hashtable.end():
        a[i] = deref(it).first
        b[i] = deref(it).second
        inc(it)
        i += 1
    return a[:i], b[:i]
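For comparison, a sketch of calling the hash-based variant on the same toy data. Unlike pdf.f, the keys come back in whatever order the unordered_map yields, so they are sorted before display (test_f.py does the same via map(sorted, ...)):

import numpy as np
from pdf_hash import f as f_hash       # built alongside pdf.so by setup.py

x = np.array([7, 3, 5, 7, 5, 7])
y = np.array([0.2, 0.1, 0.3, 0.1, 0.1, 0.2])
keys, sums = f_hash(x, y)              # overwrites x and y with the result
order = np.argsort(keys)               # impose an order for display
print(keys[order], sums[order])        # [3 5 7] [0.1 0.4 0.5]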
setup.py
Python
from distutils.core import setup
from distutils.extension import Extension
from Cython.Distutils import build_ext

setup(ext_modules=[
          Extension("pdf", ["pdf.pyx"]),
          Extension("pdf_hash", ["pdf_hash.pyx"], language="c++",
                    extra_compile_args=["-std=c++0x"],
                    ),
      ],
      cmdclass={'build_ext': build_ext})
test_f.py
Python
#!/usr/bin/env python
from collections import defaultdict
from itertools import groupby
import numpy as np
from pdf import f as f_pyx
from pdf_hash import f as f_hash


def f_slow(a, b):
    ndx = np.argsort(a)
    val = None
    a_result, b_result = [], []
    for i in ndx:
        if a[i] != val:
            val = a[i]
            a_result.append(val)
            b_result.append(0)
        b_result[-1] += b[i]
    return a_result, b_result

def f_bincount(a, b):
    result_a, inv_ndx = np.unique(a, return_inverse=True)
    result_b = np.bincount(inv_ndx, weights=b)
    return result_a, result_b

# http://stackoverflow.com/questions/7538382/python-group-by-array-a-and-summarize-array-b-performance
def approach_1(a, b):
    bResult = [sum(b[i == a]) for i in np.unique(a)]
    aResult = np.unique(a)
    return aResult, bResult

def approach_2(a, b):
    tmp = [(a[i], b[i]) for i in range(len(a))]
    tmp2 = np.array(tmp, dtype=[('a', float), ('b', float)])
    tmp2 = np.sort(tmp2, order='a')

    bResult = []
    aResult = []
    for key, group in groupby(tmp2, lambda x: x[0]):
        aResult.append(key)
        bResult.append(sum([i[1] for i in group]))
    return aResult, bResult

def approach_Pablo(a, b):
    pdf = defaultdict(int)
    for x, y in zip(a, b):
        pdf[x] += y
    return pdf.keys(), pdf.values()

def unique_Unutbu(a, b):
    x = np.bincount(a, weights=b)
    aResult = np.unique(a)
    bResult = x[aResult]
    return aResult, bResult

a = np.random.randint(1, 10, size=10000)
b = np.array([1. / len(a)] * len(a), dtype=float)

def test():
    x = np.array([7, 3, 5, 7, 5, 7], dtype=int)
    y = np.array([0.2, 0.1, 0.3, 0.1, 0.1, 0.2], dtype=float)

    # expected (in some order): [7, 3, 5], [0.5, 0.1, 0.4]

    for f in [f_slow, f_bincount,
              approach_Pablo,
              approach_2,
              approach_1,
              f_pyx,
              f_hash,
              ]:
        xx, yy = x.copy(), y.copy()
        xx, yy = map(sorted, f(xx, yy))
        assert np.allclose(xx, [3, 5, 7])
        assert np.allclose(yy, [0.1, 0.4, 0.5]), (f.__name__, yy)

if __name__ == "__main__":
    test()
unordered_map.pxd
Cython
# modified libcpp.map
from libcpp.utility cimport pair

cdef extern from "<unordered_map>" namespace "std":
    cdef cppclass unordered_map[T, U]:
        cppclass iterator:
            pair[T,U]& operator*()
            iterator operator++()
            iterator operator--()
            bint operator==(iterator)
            bint operator!=(iterator)
        cppclass reverse_iterator:
            pair[T,U]& operator*()
            iterator operator++()
            iterator operator--()
            bint operator==(reverse_iterator)
            bint operator!=(reverse_iterator)
        #cppclass const_iterator(iterator):
        #    pass
        #cppclass const_reverse_iterator(reverse_iterator):
        #    pass
        unordered_map()
        unordered_map(unordered_map&)
        #unordered_map(key_compare&)
        U& operator[](T&)
        #unordered_map& operator=(unordered_map&)
        bint operator==(unordered_map&, unordered_map&)
        bint operator!=(unordered_map&, unordered_map&)
        bint operator<(unordered_map&, unordered_map&)
        bint operator>(unordered_map&, unordered_map&)
        bint operator<=(unordered_map&, unordered_map&)
        bint operator>=(unordered_map&, unordered_map&)
        U& at(T&)
        iterator begin()
        #const_iterator begin()
        void clear()
        size_t count(T&)
        bint empty()
        iterator end()
        #const_iterator end()
        pair[iterator, iterator] equal_range(T&)
        #pair[const_iterator, const_iterator] equal_range(key_type&)
        void erase(iterator)
        void erase(iterator, iterator)
        size_t erase(T&)
        iterator find(T&)
        #const_iterator find(key_type&)
        pair[iterator, bint] insert(pair[T,U]) # XXX pair[T,U]&
        iterator insert(iterator, pair[T,U]) # XXX pair[T,U]&
        #void insert(input_iterator, input_iterator)
        #key_compare key_comp()
        iterator lower_bound(T&)
        #const_iterator lower_bound(key_type&)
        size_t max_size()
        reverse_iterator rbegin()
        #const_reverse_iterator rbegin()
        reverse_iterator rend()
        #const_reverse_iterator rend()
        size_t size()
        void swap(unordered_map&)
        iterator upper_bound(T&)
        #const_iterator upper_bound(key_type&)
        #value_compare value_comp()
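Side note, as far as I recall about later tooling rather than the setup used here: newer Cython releases bundle a std::unordered_map declaration of their own, so this hand-written .pxd can likely be dropped and the cimport in pdf_hash.pyx changed to pull unordered_map from libcpp.unordered_map instead.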
