Skip to content

Instantly share code, notes, and snippets.

@wphicks
Forked from Erotemic/benchmark_pandas.py
Last active October 18, 2018 15:15
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save wphicks/2fb62b19e2262f922fc066a34d38776e to your computer and use it in GitHub Desktop.
def benchmark_pandas():
    """Benchmark row-wise and column-wise means on a small numeric table.

    The table has 1000 rows and 25 float columns keyed 'a' through 'y'
    (``range(ord('a'), ord('z'))`` stops before 'z').  It is held both as a
    dict of NumPy arrays and as a pandas DataFrame built from that dict.

    Two groups of variants are timed with ``timerit.Timerit(100, bestof=10)``:

    * "Row Access": one mean per row, i.e. a reduction across the columns.
    * "Column Access": one mean per column, i.e. a reduction down the rows.

    Every variant is filed under the group matching what it actually
    computes: ``axis=1`` reductions are row access, ``axis=0`` reductions
    are column access.

    The ``# Sample:`` comments are results recorded by the original author;
    each one sits next to the code that produced it and is indicative only.
    """
    import timerit
    import pandas as pd
    import numpy as np

    num_rows = 1000
    column_data = {
        key: np.random.rand(num_rows)
        for key in map(chr, range(ord('a'), ord('z')))
    }
    data_frame = pd.DataFrame(column_data)

    # ------------------------------------------------------------------
    # Row access: each variant produces the mean of every row.
    # ------------------------------------------------------------------
    print('\n-----')

    for timer in timerit.Timerit(
            100, bestof=10, label='Row Access (PANDAS.loc)'):
        with timer:
            for i in range(num_rows):
                data_frame.loc[i].mean()
    # Sample: best=158.0 ms, mean=159.2 ± 0.7 ms

    for timer in timerit.Timerit(
            100, bestof=10, label='Row Access (PANDAS.iloc)'):
        with timer:
            for i in range(num_rows):
                data_frame.iloc[i].mean()
    # Sample: best=138.7 ms, mean=139.2 ± 0.43 ms

    for timer in timerit.Timerit(
            100, bestof=10, label='Row Access (PANDAS.iterrows)'):
        with timer:
            # iterrows() yields (index, Series) pairs; only the row is used.
            for _, row in data_frame.iterrows():
                np.mean(row)
    # Sample: best=86.01 ms, mean=86.67 ± 0.26 ms

    for timer in timerit.Timerit(100, bestof=10, label='Row Access (DICT)'):
        with timer:
            # axis=1 gives one mean per row; without it np.mean collapses
            # the whole nested list into a single scalar.
            np.mean(
                [
                    [column_data[k][i] for k in column_data]
                    for i in range(num_rows)
                ],
                axis=1,
            )
    # Sample: best=3.67 ms, mean=3.71 ± 0.022 ms
    # (recorded before ``axis=1`` was added)

    for timer in timerit.Timerit(
            100, bestof=10, label='Row Access (PANDAS.apply)'):
        with timer:
            # axis=1 hands each row to the function.
            data_frame.apply(lambda row: row.mean(), axis=1)
    # Sample: best=62.63 ms, mean=63.06 ± 0.22 ms

    for timer in timerit.Timerit(
            100, bestof=10, label='Row Access (PANDAS.mean)'):
        with timer:
            data_frame.mean(axis=1)
    # Sample: best=202.2 µs, mean=208.6 ± 5.8 µs

    for timer in timerit.Timerit(
            100, bestof=10, label='Row Access (PANDAS.values)'):
        with timer:
            data_frame.values.mean(1)
    # Sample: best=23.27 µs, mean=24.87 ± 4.4 µs

    # ------------------------------------------------------------------
    # Column access: each variant produces the mean of every column.
    # ------------------------------------------------------------------
    print('\n-----')

    for timer in timerit.Timerit(
            100, bestof=10, label='Column Access (PANDAS)'):
        with timer:
            for k in data_frame.keys():
                data_frame[k].mean()
    # Sample: best=1.497 ms, mean=1.507 ± 0.0056 ms

    for timer in timerit.Timerit(100, bestof=10, label='Column Access (DICT)'):
        with timer:
            for k in column_data:
                column_data[k].mean()
    # Sample: best=105.0 µs, mean=106.4 ± 0.85 µs

    for timer in timerit.Timerit(
            100, bestof=10, label='Column Access (PANDAS.apply)'):
        with timer:
            # Default axis=0 hands each column to the function.
            data_frame.apply(lambda col: col.mean())
    # Sample: best=2.151 ms, mean=2.171 ± 0.023 ms

    for timer in timerit.Timerit(
            100, bestof=10, label='Column Access (PANDAS.mean)'):
        with timer:
            data_frame.mean()
    # Sample: best=189.2 µs, mean=190.7 ± 1.1 µs

    for timer in timerit.Timerit(
            100, bestof=10, label='Column Access (PANDAS.values)'):
        with timer:
            data_frame.values.mean(0)
    # Sample: best=21.55 µs, mean=21.83 ± 0.46 µs
# Script entry point: run the benchmarks only when this file is executed
# directly, not when it is imported as a module.
if __name__ == '__main__':
    benchmark_pandas()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment