Skip to content

Instantly share code, notes, and snippets.

@chris-b1
Created June 12, 2017 16:50
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save chris-b1/94b6116da4a88a884e71f2253af56211 to your computer and use it in GitHub Desktop.
Save chris-b1/94b6116da4a88a884e71f2253af56211 to your computer and use it in GitHub Desktop.
import pandas as pd
import numpy as np
from pyarrow.parquet import read_pandas, write_table
from pyarrow import Table
import timeit
# Benchmark fixture: N rows with four float columns, one daily datetime
# column and two string columns drawn from a three-value pool.
N = 25000
s = np.array(['AAAAAAAA', 'BBBBBBB', 'CCCCCCCC'], dtype='O')
np.random.seed(42)  # fixed seed so the random string columns are repeatable

# Columns are added in the order f1..f4, d1, s1, s2; s1 is drawn before s2
# so the seeded random stream is consumed in the same order on every run.
_columns = {'f{}'.format(i): np.linspace(0, 100, num=N) for i in range(1, 5)}
_columns['d1'] = pd.date_range('1900-01-01', periods=N)
_columns['s1'] = np.random.choice(s, N)
_columns['s2'] = np.random.choice(s, N)
df = pd.DataFrame(_columns)
# Write benchmarks. Each case is (label, statement); the statement string is
# handed to ``timeit`` and evaluated against this module's globals, so it can
# reference ``df``, ``write_table`` and ``Table`` directly.
write_cases = [
    ('Excel - XlsxWriter', 'df.to_excel("tmp.xlsx", index=False, engine="xlsxwriter")'),
    ('Excel - Openpyxl', 'df.to_excel("tmp.xlsx", index=False, engine="openpyxl")'),
    ('CSV', 'df.to_csv("tmp.csv", index=False)'),
    # ``Table.from_pandas`` no longer takes ``timestamps_to_ms``; the
    # millisecond coercion is requested from the Parquet writer instead.
    ('Parquet', 'write_table(Table.from_pandas(df), "tmp.pq", coerce_timestamps="ms")'),
]

# Number of repetitions per case; the reported figure is the mean.
n_runs = 3

results = {}
for label, stmt in write_cases:
    # timeit returns total seconds for ``n_runs`` executions; convert to the
    # mean duration of a single execution in milliseconds.
    t = timeit.timeit(stmt, globals=globals(), number=n_runs) / n_runs * 1000
    results[label] = t
writes = pd.Series(results, name='Write Time(ms)')
# Read benchmarks. Each case is (label, statement); the statements read back
# the files named in the write cases ("tmp.xlsx", "tmp.csv", "tmp.pq") and
# are evaluated by ``timeit`` against this module's globals.
read_cases = [
    # NOTE(review): xlrd 2.0+ only reads legacy .xls files; this case needs
    # xlrd < 2.0, or a deliberate switch to engine="openpyxl".
    ('Excel - Xlrd', 'pd.read_excel("tmp.xlsx", engine="xlrd")'),
    ('CSV', 'pd.read_csv("tmp.csv")'),
    # ``read_pandas`` returns a pyarrow Table; convert it so this case ends
    # with a DataFrame like the Excel and CSV cases do.
    ('Parquet', 'read_pandas("tmp.pq").to_pandas()'),
]

results = {}
for label, stmt in read_cases:
    # timeit returns total seconds for ``n_runs`` executions; convert to the
    # mean duration of a single execution in milliseconds.
    t = timeit.timeit(stmt, globals=globals(), number=n_runs) / n_runs * 1000
    results[label] = t
reads = pd.Series(results, name='Read Time(ms)')
# Combine the write and read timings side by side, aligned on the case label.
# Labels present in only one of the two series get NaN in the other column.
# Note: this rebinds ``df``, replacing the benchmark input frame.
df = pd.concat([writes, reads], axis=1)

# Use the fully qualified option key: the bare 'precision' pattern matches
# more than one option on recent pandas and raises OptionError.
with pd.option_context('display.precision', 2):
    print(df)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment