Python group by array a, and summarize array b - Performance

Gist by @zed, created September 24, 2011.
# .gitignore
/build/
/pdf.c
/pdf.so
/pdf_hash.so
/pdf_hash.cpp
# Makefile
.PHONY: all test clean measure_pyx measure_bincount measure_slow measure_so measure_hash

NAME = pdf

all: test measure_pyx measure_bincount measure_hash

test: $(NAME).so test_f.py
	python test_f.py

measure_pyx: $(NAME).so test_f.py
	python -mtimeit -s'from test_f import a,b;' 'x,y=a.copy(),b.copy();'
	python -mtimeit -s'from test_f import a,b; from pdf import f' 'x,y=a.copy(),b.copy(); f(x,y);'

measure_hash: pdf_hash.so test_f.py
	python -mtimeit -s'from test_f import a,b; from pdf_hash import f' 'x,y=a.copy(),b.copy(); f(x,y);'

measure_bincount: test_f.py
	python -mtimeit -s'from test_f import a,b, f_bincount as f' 'f(a,b)'

measure_slow: test_f.py
	python -mtimeit -s'from test_f import a,b, f_slow as f' 'f(a,b)'

measure_so:
	python -mtimeit -s'from test_f import a,b, approach_Pablo as f' 'f(a,b)'
	python -mtimeit -s'from test_f import a,b, approach_2 as f' 'f(a,b)'
	python -mtimeit -s'from test_f import a,b, approach_1 as f' 'f(a,b)'

$(NAME).so pdf_hash.so: $(NAME).pyx pdf_hash.pyx setup.py
	python setup.py build_ext -i

clean:
	git clean -x -d -f

Data: np.random.uniform(1, 10, size=10000)

function          time, msec   ratio    source
copy              0.0138       1884.1   numpy
f_pyx             1.02         25.5     [2]
f_bincount        1.87         13.9     [2]
f_hash            2.04         12.7     [2]
approach_Pablo    18.1         1.4      [1]
f_slow            26           1.0      [2]
approach_2        63           0.4      [1]
approach_1        915          0.0      [1]

Data: np.random.randint(1, 10, 10000)

function          time, msec   ratio    source
copy              0.0135       829.6    numpy
f_hash            0.216        51.9     [2]
unique_Unutbu     0.372        30.1     [1]
f_pyx             0.533        21.0     [2]
f_bincount        1.28         8.8      [2]
approach_1        4.60         2.4      [1]
approach_Pablo    6.83         1.6      [1]
f_slow            11.2         1.0      [2]
approach_2        49.8         0.2      [1]

(ratio = f_slow time / time, i.e. speedup relative to the pure-Python f_slow baseline)

[1] code from the question: http://stackoverflow.com/questions/7538382/python-group-by-array-a-and-summarize-array-b-performance

[2] code from this answer
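The two rankings differ mainly because the number of distinct keys in the data differs by three orders of magnitude: uniform floats are essentially all unique, while randint(1, 10) produces only nine distinct integers, which is what pushes f_hash and unique_Unutbu to the top of the second table. A quick check (a sketch added here, not part of the original benchmark):

import numpy as np
print(len(np.unique(np.random.uniform(1, 10, size=10000))))  # ~10000 distinct float keys
print(len(np.unique(np.random.randint(1, 10, 10000))))       # 9 distinct integer keys (1..9)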

# pdf.pyx -- sort-based group-by-sum, compiled with Cython
import numpy as np
cimport numpy as np

ctypedef np.int_t dtype1_t
ctypedef np.float_t dtype2_t

def f(np.ndarray[dtype1_t,ndim=1] a, np.ndarray[dtype2_t,ndim=1] b):
    assert a.shape[0] and a.shape[0] == b.shape[0]
    cdef:
        np.ndarray[Py_ssize_t,ndim=1] ndx = np.argsort(a)
        Py_ssize_t i, pos = -1
        dtype1_t val = -1  # choose val not in a
    a[:] = a[ndx]; b[:] = b[ndx]  # sort both arrays inplace, by a
    for i in range(a.shape[0]):
        if a[i] != val:
            pos += 1
            a[pos] = val = a[i]  # save unique value
            b[pos] = b[i]        # start count for this group
        else:
            b[pos] += b[i]       # accumulate within the group
    return a[:pos+1], b[:pos+1]
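For reference, the same sort-and-scan idea that the Cython loop above implements can be written in plain NumPy; f_reference below is a name introduced here for illustration only and is not one of the benchmarked functions.

import numpy as np

def f_reference(a, b):
    # sort both arrays by a, find where the key changes, then sum each run
    ndx = np.argsort(a)
    a, b = a[ndx], b[ndx]
    starts = np.r_[0, np.flatnonzero(np.diff(a)) + 1]  # first index of each group
    return a[starts], np.add.reduceat(b, starts)       # unique keys, per-group sums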
# pdf_hash.pyx -- group-by-sum using a C++ std::unordered_map, compiled with Cython
# dereference and increment operators
from cython.operator cimport dereference as deref, preincrement as inc
from unordered_map cimport unordered_map

cimport numpy as np

ctypedef np.int_t dtype1_t
ctypedef np.float_t dtype2_t

def f(np.ndarray[dtype1_t,ndim=1] a, np.ndarray[dtype2_t,ndim=1] b):
    assert a.shape[0] and a.shape[0] == b.shape[0]
    # sum b values grouped by the corresponding a value
    cdef unordered_map[dtype1_t,dtype2_t] hashtable
    cdef Py_ssize_t i
    for i in range(a.shape[0]):
        hashtable[a[i]] += b[i]
    # copy the (key, sum) pairs back into the input numpy arrays
    cdef unordered_map[dtype1_t,dtype2_t].iterator it = hashtable.begin()
    i = 0
    while it != hashtable.end():
        a[i] = deref(it).first
        b[i] = deref(it).second
        inc(it)
        i += 1
    return a[:i], b[:i]
# setup.py
from distutils.core import setup
from distutils.extension import Extension
from Cython.Distutils import build_ext

setup(ext_modules=[
    Extension("pdf", ["pdf.pyx"]),
    Extension("pdf_hash", ["pdf_hash.pyx"], language="c++",
              extra_compile_args=["-std=c++0x"],
              ),
    ],
    cmdclass={'build_ext': build_ext})
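Once the extensions are built in place (python setup.py build_ext -i, as the Makefile's default rule does), both compiled modules expose the same f(a, b) interface. A minimal usage sketch, reusing the sample data from test_f.py below; note that f modifies its arguments in place (hence the copies) and that the array dtypes must match the ctypedefs in the .pyx files:

import numpy as np
from pdf import f as f_pyx        # sort-based version
from pdf_hash import f as f_hash  # std::unordered_map version

x = np.array([7, 3, 5, 7, 5, 7])
y = np.array([0.2, 0.1, 0.3, 0.1, 0.1, 0.2])
print(f_pyx(x.copy(), y.copy()))   # keys [3, 5, 7], sums [0.1, 0.4, 0.5]
print(f_hash(x.copy(), y.copy()))  # same pairs, in hash-table order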
#!/usr/bin/env python
# test_f.py -- correctness test and pure Python/NumPy reference implementations
from collections import defaultdict
from itertools import groupby

import numpy as np

from pdf import f as f_pyx
from pdf_hash import f as f_hash


def f_slow(a, b):
    # sort by a, then accumulate b over each run of equal keys
    ndx = np.argsort(a)
    val = None
    a_result, b_result = [], []
    for i in ndx:
        if a[i] != val:
            val = a[i]
            a_result.append(val)
            b_result.append(0)
        b_result[-1] += b[i]
    return a_result, b_result


def f_bincount(a, b):
    result_a, inv_ndx = np.unique(a, return_inverse=True)
    result_b = np.bincount(inv_ndx, weights=b)
    return result_a, result_b


# http://stackoverflow.com/questions/7538382/python-group-by-array-a-and-summarize-array-b-performance
def approach_1(a, b):
    bResult = [sum(b[i == a]) for i in np.unique(a)]
    aResult = np.unique(a)
    return aResult, bResult


def approach_2(a, b):
    tmp = [(a[i], b[i]) for i in range(len(a))]
    tmp2 = np.array(tmp, dtype=[('a', float), ('b', float)])
    tmp2 = np.sort(tmp2, order='a')
    bResult = []
    aResult = []
    for key, group in groupby(tmp2, lambda x: x[0]):
        aResult.append(key)
        bResult.append(sum([i[1] for i in group]))
    return aResult, bResult


def approach_Pablo(a, b):
    pdf = defaultdict(int)
    for x, y in zip(a, b):
        pdf[x] += y
    return pdf.keys(), pdf.values()


def unique_Unutbu(a, b):
    x = np.bincount(a, weights=b)
    aResult = np.unique(a)
    bResult = x[aResult]
    return aResult, bResult


a = np.random.randint(1, 10, size=10000)
b = np.array([1. / len(a)] * len(a), dtype=np.float)


def test():
    x = np.array([7, 3, 5, 7, 5, 7], dtype=np.int)
    y = np.array([0.2, 0.1, 0.3, 0.1, 0.1, 0.2], dtype=np.float)
    # expected groups: [7, 3, 5] with sums [0.5, 0.1, 0.4]
    for f in [f_slow, f_bincount,
              approach_Pablo,
              approach_2,
              approach_1,
              f_pyx,
              f_hash,
              ]:
        xx, yy = x.copy(), y.copy()
        xx, yy = map(sorted, f(xx, yy))
        assert np.allclose(xx, [3, 5, 7])
        assert np.allclose(yy, [0.1, 0.4, 0.5]), (f.__name__, yy)


if __name__ == "__main__":
    test()
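The timings in the tables above come from python -mtimeit runs like the Makefile's measure_* targets; the same measurement can be made from inside Python (numbers will of course vary by machine). A sketch for one row:

import timeit

setup = "from test_f import a, b, f_bincount as f"
best = min(timeit.repeat("f(a, b)", setup=setup, repeat=3, number=100)) / 100
print("f_bincount: %.3f msec per call" % (best * 1e3))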
# unordered_map.pxd -- modified libcpp.map: declares std::unordered_map for Cython
from libcpp.utility cimport pair

cdef extern from "<unordered_map>" namespace "std":
    cdef cppclass unordered_map[T, U]:
        cppclass iterator:
            pair[T,U]& operator*()
            iterator operator++()
            iterator operator--()
            bint operator==(iterator)
            bint operator!=(iterator)
        cppclass reverse_iterator:
            pair[T,U]& operator*()
            iterator operator++()
            iterator operator--()
            bint operator==(reverse_iterator)
            bint operator!=(reverse_iterator)
        #cppclass const_iterator(iterator):
        #    pass
        #cppclass const_reverse_iterator(reverse_iterator):
        #    pass
        unordered_map()
        unordered_map(unordered_map&)
        #unordered_map(key_compare&)
        U& operator[](T&)
        #unordered_map& operator=(unordered_map&)
        bint operator==(unordered_map&, unordered_map&)
        bint operator!=(unordered_map&, unordered_map&)
        bint operator<(unordered_map&, unordered_map&)
        bint operator>(unordered_map&, unordered_map&)
        bint operator<=(unordered_map&, unordered_map&)
        bint operator>=(unordered_map&, unordered_map&)
        U& at(T&)
        iterator begin()
        #const_iterator begin()
        void clear()
        size_t count(T&)
        bint empty()
        iterator end()
        #const_iterator end()
        pair[iterator, iterator] equal_range(T&)
        #pair[const_iterator, const_iterator] equal_range(key_type&)
        void erase(iterator)
        void erase(iterator, iterator)
        size_t erase(T&)
        iterator find(T&)
        #const_iterator find(key_type&)
        pair[iterator, bint] insert(pair[T,U])  # XXX pair[T,U]&
        iterator insert(iterator, pair[T,U])    # XXX pair[T,U]&
        #void insert(input_iterator, input_iterator)
        #key_compare key_comp()
        iterator lower_bound(T&)
        #const_iterator lower_bound(key_type&)
        size_t max_size()
        reverse_iterator rbegin()
        #const_reverse_iterator rbegin()
        reverse_iterator rend()
        #const_reverse_iterator rend()
        size_t size()
        void swap(unordered_map&)
        iterator upper_bound(T&)
        #const_iterator upper_bound(key_type&)
        #value_compare value_comp()
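Note: this hand-written declaration file reflects the Cython version available at the time; more recent Cython releases bundle their own wrapper (from libcpp.unordered_map cimport unordered_map), which would make this shim unnecessary.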