Skip to content

Instantly share code, notes, and snippets.

@rvernica
Last active September 26, 2017 03:53
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save rvernica/354ad9057bdec44ca5471a1b8eec6765 to your computer and use it in GitHub Desktop.
# AFL% set no fetch; store(apply(build(<x:int64>[i=1:1000; j=1:1000], i + j * 1000), y, 1e6 / double(i + 1), z, string(i)), df); set fetch;
# Query was executed successfully
# AFL% summarize(df);
# {inst,attid} att,count,bytes,chunks,min_count,avg_count,max_count,min_bytes,avg_bytes,max_bytes
# {0,0} 'all',1000000,8041133,4,1000000,1e+06,1000000,48,2.01028e+06,8000072
# In [17]: sys.getsizeof(df)/1024/1024.
# Out[17]: 87.70475769042969 MB
# In [15]: df.dtypes
# Out[15]:
# i int64
# j int64
# x float64
# y float64
# z object
# dtype: object
# 35M Aug 21 19:24 df.npy
# 37M Aug 21 19:02 df.pkl
# 38M Aug 21 19:03 df.feather
# 38M Sep 25 19:37 df.arrow
# In [8]: %timeit test_numpy_write_npy(df)
# 2.1 s ± 35.2 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
# In [9]: 35 / 2.1
# Out[9]: 16.666666666666664 MB/s
# In [10]: %timeit test_pickle_write(df)
# 510 ms ± 6.84 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
# In [11]: 37 / .510
# Out[11]: 72.54901960784314 MB/s
# In [4]: %timeit test_feather_write(df)
# 336 ms ± 17.2 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
# In [5]: 38 / .336
# Out[5]: 113.09523809523809 MB/s
# In [6]: %timeit test_arrow_stream_write(df)
# 254 ms ± 34.2 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
# In [7]: 38 / .254
# Out[7]: 149.60629921259843 MB/s
# In [12]: %timeit test_numpy_read_npy()
# 1.18 s ± 3.58 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
# In [14]: 35 / 1.18
# Out[14]: 29.661016949152543 MB/s
# In [13]: %timeit test_pickle_read()
# 121 ms ± 1.97 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
# In [15]: 37 / .121
# Out[15]: 305.78512396694214 MB/s
# In [16]: %timeit test_feather_read()
# 67.1 ms ± 2.83 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
# In [17]: 38 / .0671
# Out[17]: 566.3189269746646 MB/s
# In [18]: %timeit test_arrow_stream_read()
# 70.5 ms ± 788 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
# In [19]: 38 / .0705
# Out[19]: 539.0070921985816 MB/s
import numpy
import pandas
import pyarrow
import scidbpy
import sys
def read_scidb():
    """Fetch the `df` array from SciDB and return it as a pandas DataFrame."""
    return scidbpy.connect().arrays.df.fetch(as_dataframe=True)
def read_pickle():
    """Load the benchmark DataFrame previously pickled to 'df.pkl'."""
    frame = pandas.read_pickle('df.pkl')
    return frame
def test_pickle_write(df):
df.to_pickle('df.pkl')
def test_pickle_read():
pandas.read_pickle('df.pkl')
def test_feather_write(df):
    """Serialize *df* to 'df.feather' in the Feather format (timing target)."""
    path = 'df.feather'
    df.to_feather(path)
def test_feather_read():
    """Deserialize 'df.feather'; the result is discarded (I/O timing only)."""
    _ = pandas.read_feather('df.feather')
def test_numpy_write_npy(df):
numpy.save('df.npy', df.to_records())
def test_numpy_read_npy():
numpy.load('df.npy')
def test_arrow_stream_write(df):
    """Write *df* as a single record batch in the Arrow IPC stream format.

    Fixes a resource leak in the original: the ``OSFile`` and the
    ``RecordBatchStreamWriter`` were never closed, so flushing and the
    stream's end-of-stream marker depended on garbage collection.
    Closing the writer finalizes the stream; closing the file releases
    the OS handle.
    """
    batch = pyarrow.RecordBatch.from_pandas(df, preserve_index=False)
    sink = pyarrow.OSFile('df.arrow', 'wb')
    try:
        writer = pyarrow.RecordBatchStreamWriter(sink, batch.schema)
        try:
            writer.write_batch(batch)
        finally:
            writer.close()
    finally:
        sink.close()
def test_arrow_stream_read():
    """Read 'df.arrow' back into a pandas DataFrame (result discarded).

    Fixes a resource leak in the original: the ``OSFile`` handle was
    never closed and its release depended on garbage collection.
    """
    source = pyarrow.OSFile('df.arrow', 'rb')
    try:
        pyarrow.RecordBatchStreamReader(source).read_pandas()
    finally:
        source.close()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment