Skip to content

Instantly share code, notes, and snippets.

@kkirsanov
Last active April 25, 2019 06:22
Show Gist options
  • Save kkirsanov/620e02115217cb87d3e74ba5e1357b70 to your computer and use it in GitHub Desktop.
Save kkirsanov/620e02115217cb87d3e74ba5e1357b70 to your computer and use it in GitHub Desktop.
dataframe serialization performance
import os
import string
import time

import numpy as np
import pandas as pd
import pandavro
def size(size, precision=2):
    """Format a byte count as a human-readable string such as ``"1.50KB"``.

    The value is divided by 1024 until it drops below 1024 or the largest
    known unit (TB) is reached, then rendered with a fixed number of
    decimals followed directly by the unit suffix.

    Args:
        size: Number of bytes (int or float).
        precision: Number of digits after the decimal point.

    Returns:
        The formatted string, e.g. ``size(1536) == "1.50KB"``.
    """
    suffixes = ['B', 'KB', 'MB', 'GB', 'TB']
    suffix_index = 0
    # ">=" (not ">") so that exactly 1024 of a unit rolls over to the next
    # one: 1024 bytes is "1.00KB", not "1024.00B".
    while size >= 1024 and suffix_index < len(suffixes) - 1:
        suffix_index += 1
        size = size / 1024.0
    return "%.*f%s" % (precision, size, suffixes[suffix_index])
# Number of rows in the benchmark frame.
cnt = 1000000

# Column A: the integers 0 .. cnt-1.
data = np.arange(cnt)
# Column B: floats from 0 up to (but excluding) 1 in steps of 1/cnt.
data2 = np.arange(0, 1, 1.0 / cnt)
# Column C: cnt evenly spaced timestamps between 2018-01-01 and 2018-01-08.
data3 = pd.date_range(start='1/1/2018', end='1/08/2018', periods=cnt)

# Column D: cnt random 3-character strings drawn from letters and digits.
# pd.util.testing.rands() was deprecated in pandas 1.0 and removed in 2.0, so
# the strings are generated directly: draw 3 * cnt single characters in one
# vectorised call, then reinterpret every run of 3 characters as one string.
_alphabet = np.array(list(string.ascii_letters + string.digits))
data4 = np.random.choice(_alphabet, size=cnt * 3).view('U3').tolist()

df = pd.DataFrame(dict(A=data, B=data2, C=data3, D=data4))
# Request millisecond resolution for the timestamp column.
# NOTE(review): presumably for compatibility with the Avro/Parquet writers
# used later in this script -- confirm.
df['C'] = df['C'].astype('datetime64[ms]')
def _timed(func, *args):
    """Call ``func(*args)`` once and return ``(elapsed_seconds, result)``.

    Uses ``time.perf_counter`` rather than ``time.time`` so that a system
    clock adjustment during the run cannot distort the measurement.
    """
    started = time.perf_counter()
    result = func(*args)
    return time.perf_counter() - started, result


# One entry per format: (printed label, output path, writer(path), reader(path)).
# Each writer serialises ``df`` to the given path; each reader loads it back.
_formats = [
    ('csv ', 'demo.csv', df.to_csv, pd.read_csv),
    ('json', 'demo.json', df.to_json, pd.read_json),
    ('avro', 'demo.avro',
     lambda path: pandavro.to_avro(path, df), pandavro.from_avro),
    ('parquet', 'demo.parquet',
     lambda path: df.to_parquet(path, compression=None), pd.read_parquet),
    ('pickle', 'demo.pickle', df.to_pickle, pd.read_pickle),
]

print("format \t|\t time write \t|\t time read \t|\t size")
for label, path, write, read in _formats:
    # Only the serialisation call itself is timed; printing happens afterwards.
    write_secs, _ = _timed(write, path)
    # df2 holds the most recently loaded frame, as in the original script.
    read_secs, df2 = _timed(read, path)
    print(label, write_secs, read_secs, size(os.path.getsize(path)),
          sep="\t|\t")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment