|
#!/usr/bin/env python |
|
from collections import defaultdict |
|
from itertools import groupby |
|
import numpy as np |
|
from pdf import f as f_pyx |
|
from pdf_hash import f as f_hash |
|
|
|
|
|
def f_slow(a, b):
    """Group equal values of ``a`` and sum the matching entries of ``b``.

    Pure-Python reference implementation: visits the positions of ``a`` in
    the order given by ``np.argsort`` and opens a new group whenever the
    value differs from the previously recorded key.

    Args:
        a: 1-D array of group keys (indexed with the positions returned by
            ``np.argsort(a)``).
        b: Weights, indexable by the same positions as ``a``.

    Returns:
        Tuple ``(a_result, b_result)`` of Python lists: the distinct keys in
        sorted order and, for each key, the sum of its entries in ``b``.
    """
    a_result, b_result = [], []
    for i in np.argsort(a):
        key = a[i]
        # Sorted traversal places equal keys next to each other, so comparing
        # with the last recorded key is enough to detect a new group.  Using
        # the result list itself (instead of a ``None`` sentinel) keeps a
        # literal ``None`` key from being mistaken for "no group yet".
        if not a_result or key != a_result[-1]:
            a_result.append(key)
            b_result.append(0)
        b_result[-1] += b[i]
    return a_result, b_result
|
|
|
def f_bincount(a, b):
    """Sum the weights in ``b`` per distinct value of ``a`` using NumPy only.

    ``np.unique`` yields the distinct keys together with, for every element
    of ``a``, the index of its key; ``np.bincount`` then accumulates ``b``
    into one bucket per key index.

    Returns:
        Tuple ``(keys, sums)`` of NumPy arrays, one entry per distinct key.
    """
    keys, key_index = np.unique(a, return_inverse=True)
    return keys, np.bincount(key_index, weights=b)
|
|
|
# http://stackoverflow.com/questions/7538382/python-group-by-array-a-and-summarize-array-b-performance |
|
def approach_1(a, b):
    """Sum ``b`` per distinct value of ``a`` with one boolean mask per key.

    Args:
        a: 1-D array-like of group keys.
        b: 1-D array-like of weights, same length as ``a``.

    Returns:
        Tuple ``(aResult, bResult)``: the distinct keys as a NumPy array and
        a list holding the sum of ``b`` for each key, in the same order.
    """
    # Coerce once so plain Python sequences work with the mask indexing below.
    a = np.asarray(a)
    b = np.asarray(b)
    # Compute the distinct keys a single time and reuse them for both outputs.
    aResult = np.unique(a)
    # Each key scans all of ``a``, so the cost grows with len(a) * len(aResult).
    bResult = [sum(b[a == key]) for key in aResult]
    return aResult, bResult
|
|
|
def approach_2(a, b):
    """Sum ``b`` per distinct value of ``a`` via a sorted structured array.

    The (key, weight) pairs are packed into a structured array with float
    fields ``'a'`` and ``'b'``, sorted on ``'a'``, and then reduced with
    ``itertools.groupby`` over the consecutive runs of equal keys.

    Returns:
        Tuple ``(keys, sums)`` of Python lists; because of the record dtype
        the keys come back as floats.
    """
    record_type = [('a', float), ('b', float)]
    pairs = [(a[idx], b[idx]) for idx in range(len(a))]
    records = np.sort(np.array(pairs, dtype=record_type), order='a')

    # groupby only merges adjacent items, which is why the sort comes first.
    grouped = [
        (key, sum([rec['b'] for rec in members]))
        for key, members in groupby(records, lambda rec: rec['a'])
    ]
    keys = [key for key, _ in grouped]
    sums = [total for _, total in grouped]
    return keys, sums
|
|
|
def approach_Pablo(a, b):
    """Sum ``b`` per distinct value of ``a`` with a dictionary accumulator.

    Args:
        a: Iterable of hashable group keys.
        b: Iterable of weights, paired with ``a`` element by element.

    Returns:
        Tuple ``(keys, sums)`` of lists in the dictionary's iteration order;
        ``sums[k]`` is the total weight accumulated for ``keys[k]``.
    """
    pdf = defaultdict(int)
    for x, y in zip(a, b):
        pdf[x] += y
    # Materialise the dict views so callers receive real, indexable sequences
    # (as on Python 2) that are decoupled from the internal dictionary.
    return list(pdf.keys()), list(pdf.values())
|
|
|
def unique_Unutbu(a, b):
    """Sum ``b`` per distinct value of ``a`` by binning on the values themselves.

    ``np.bincount`` uses every element of ``a`` directly as a bucket index,
    so ``a`` has to contain non-negative integers; the buckets that belong to
    values actually present in ``a`` are then picked out via ``np.unique``.

    Returns:
        Tuple ``(keys, sums)`` of NumPy arrays, one entry per distinct key.
    """
    per_value_sum = np.bincount(a, weights=b)
    keys = np.unique(a)
    return keys, per_value_sum[keys]
|
|
|
# Module-level sample data: 10000 integer keys drawn from 1..9 (the upper
# bound of ``randint`` is exclusive).
a = np.random.randint(1, 10, size=10000)
# Equal weights that add up to 1.  The builtin ``float`` is used because the
# ``np.float`` alias was removed in NumPy 1.24.
b = np.full(len(a), 1.0 / len(a), dtype=float)
|
|
|
def test():
    """Check every grouping implementation against a small hand-made case.

    Keys ``[7, 3, 5, 7, 5, 7]`` with the weights below must reduce to the
    keys ``[3, 5, 7]`` with sums ``[0.1, 0.4, 0.5]``.  Both outputs are sorted
    before comparison, so the order in which an implementation emits its
    groups does not matter.

    Raises:
        AssertionError: if an implementation returns different keys or sums;
            the message carries the function name and the offending values.
    """
    # Builtin int/float replace the np.int/np.float aliases removed in
    # NumPy 1.24.
    x = np.array([7, 3, 5, 7, 5, 7], dtype=int)
    y = np.array([0.2, 0.1, 0.3, 0.1, 0.1, 0.2], dtype=float)

    implementations = [
        f_slow,
        f_bincount,
        approach_Pablo,
        approach_2,
        approach_1,
        unique_Unutbu,
        f_pyx,
        f_hash,
    ]
    for f in implementations:
        # Hand each implementation its own copies of the inputs.
        xx, yy = map(sorted, f(x.copy(), y.copy()))
        assert np.allclose(xx, [3, 5, 7]), (f.__name__, xx)
        assert np.allclose(yy, [0.1, 0.4, 0.5]), (f.__name__, yy)
|
|
|
# Run the self-check only when this file is executed as a script.
if __name__ == "__main__":
    test()
|
|
|
|