Skip to content

Instantly share code, notes, and snippets.

@BodonFerenc
Last active June 24, 2018 18:51
Show Gist options
  • Save BodonFerenc/328628f67217955e30e56b02e670259f to your computer and use it in GitHub Desktop.
Save BodonFerenc/328628f67217955e30e56b02e670259f to your computer and use it in GitHub Desktop.
Data aggregation elapsed time in Python and Q. See article https://www.linkedin.com/pulse/python-data-analysis-really-simple-ferenc-bodon-ph-d-/
rowCount: 100*1000
t: ([] bucket: string rowCount?`2; qty: rowCount?100; risk: rowCount?10; weight: rowCount?2.)
\t do[100; select NR: count i, TOTAL_QTY: sum qty, AVG_QTY: avg qty, TOTAL_RISK: sum risk, AVG_QTY: avg risk, W_AVG_QTY: weight wavg qty, W_AVG_RISK: weight wavg risk by bucket from t]
rowCount: 100*1000
t: ([] bucket: rowCount?`2; qty: rowCount?100; risk: rowCount?10; weight: rowCount?2.)
\t do[100; select NR: count i, TOTAL_QTY: sum qty, AVG_QTY: avg qty, TOTAL_RISK: sum risk, AVG_QTY: avg risk, W_AVG_QTY: weight wavg qty, W_AVG_RISK: weight wavg risk by bucket from t]
import timeit
createTable = '''
import pandas as pd
import numpy as np
import random
import string
rowCount = 100*1000
t = pd.DataFrame({'bucket': [''.join(random.choices(string.ascii_lowercase, k=2)) for _ in range(rowCount)],
'weight': [random.uniform(0, 2) for _ in range(rowCount)],
'qty': [random.randint(0, 100) for _ in range(rowCount)],
'risk': [random.randint(0, 10) for _ in range(rowCount)]})
'''
aggrSingleExpression = '''
def my_agg(x):
data = {'NR': x.bucket.count(),
'TOTAL_QTY': x.qty.sum(),
'AVG_QTY': x.qty.mean(),
'TOTAL_RISK': x.risk.sum(),
'AVG_RISK': x.risk.mean(),
'W_AVG_QTY': np.average(x.qty, weights=x.weight),
'W_AVG_RISK': np.average(x.risk, weights=x.weight)
}
return pd.Series(data, index=['NR', 'TOTAL_QTY', 'AVG_QTY', 'TOTAL_RISK',
'AVG_RISK', 'W_AVG_QTY', 'W_AVG_RISK'])
t.groupby('bucket').apply(my_agg)
'''
print("Python single expression:", timeit.timeit(setup=createTable, stmt=aggrSingleExpression, number = 100))
import timeit
createTable = '''
import pandas as pd
import numpy as np
import random
import string
rowCount = 100*1000
t = pd.DataFrame({'bucket': [''.join(random.choices(string.ascii_lowercase, k=2)) for _ in range(rowCount)],
'weight': [random.uniform(0, 2) for _ in range(rowCount)],
'qty': [random.randint(0, 100) for _ in range(rowCount)],
'risk': [random.randint(0, 10) for _ in range(rowCount)]})
'''
aggrWithJoin = '''
res = t.groupby('bucket').agg({'bucket': len, 'qty': [sum, np.mean], 'risk': [sum, np.mean]})
res.columns = res.columns.map('_'.join)
res.rename(columns={'bucket_len': 'NR', 'qty_sum': 'TOTAL_QTY', 'qty_mean': 'AVG_QTY',
'risk_sum': 'TOTAL_RISK', 'risk_mean': 'AVG_RISK'}).join(
t.groupby('bucket').apply(lambda g: np.average(g.qty, weights=g.weight)).to_frame('W_AVG_QTY')).join(
t.groupby('bucket').apply(lambda g: np.average(g.risk, weights=g.weight)).to_frame('W_AVG_RISK')
)
'''
print("Python with Join 1:", timeit.timeit(setup=createTable, stmt=aggrWithJoin, number = 100))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment