Last active
September 26, 2017 03:53
-
-
Save rvernica/354ad9057bdec44ca5471a1b8eec6765 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# AFL% set no fetch; store(apply(build(<x:int64>[i=1:1000; j=1:1000], i + j * 1000), y, 1e6 / double(i + 1), z, string(i)), df); set fetch;
# Query was executed successfully
# AFL% summarize(df);
# {inst,attid} att,count,bytes,chunks,min_count,avg_count,max_count,min_bytes,avg_bytes,max_bytes
# {0,0} 'all',1000000,8041133,4,1000000,1e+06,1000000,48,2.01028e+06,8000072
# In [17]: sys.getsizeof(df)/1024/1024.
# Out[17]: 87.70475769042969 MB
# In [15]: df.dtypes
# Out[15]:
# i int64
# j int64
# x float64
# y float64
# z object
# dtype: object
# 35M Aug 21 19:24 df.npy
# 37M Aug 21 19:02 df.pkl
# 38M Aug 21 19:03 df.feather
# 38M Sep 25 19:37 df.arrow
# In [8]: %timeit test_numpy_write_npy(df)
# 2.1 s ± 35.2 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
# In [9]: 35 / 2.1
# Out[9]: 16.666666666666664 MB/s
# In [10]: %timeit test_pickle_write(df)
# 510 ms ± 6.84 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
# In [11]: 37 / .510
# Out[11]: 72.54901960784314 MB/s
# In [4]: %timeit test_feather_write(df)
# 336 ms ± 17.2 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
# In [5]: 38 / .336
# Out[5]: 113.09523809523809 MB/s
# In [6]: %timeit test_arrow_stream_write(df)
# 254 ms ± 34.2 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
# In [7]: 38 / .254
# Out[7]: 149.60629921259843 MB/s
# In [12]: %timeit test_numpy_read_npy()
# 1.18 s ± 3.58 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
# In [14]: 35 / 1.18
# Out[14]: 29.661016949152543 MB/s
# In [13]: %timeit test_pickle_read()
# 121 ms ± 1.97 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
# In [15]: 37 / .121
# Out[15]: 305.78512396694214 MB/s
# In [16]: %timeit test_feather_read()
# 67.1 ms ± 2.83 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
# In [17]: 38 / .0671
# Out[17]: 566.3189269746646 MB/s
# In [18]: %timeit test_arrow_stream_read()
# 70.5 ms ± 788 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
# In [19]: 38 / .0705
# Out[19]: 539.0070921985816 MB/s
import numpy | |
import pandas | |
import pyarrow | |
import scidbpy | |
import sys | |
def read_scidb():
    """Fetch the SciDB array ``df``, requesting the result as a DataFrame.

    Opens a connection with ``scidbpy.connect()`` using its default
    arguments and returns whatever ``fetch(as_dataframe=True)`` produces.
    """
    connection = scidbpy.connect()
    return connection.arrays.df.fetch(as_dataframe=True)
def read_pickle():
    """Load and return the object pickled in ``df.pkl``.

    The path is relative to the current working directory.
    """
    frame = pandas.read_pickle('df.pkl')
    return frame
def test_pickle_write(df):
    """Benchmark body: pickle ``df`` to ``df.pkl`` in the working directory.

    Nothing is returned; the function exists to be timed.
    """
    target = 'df.pkl'
    df.to_pickle(target)
def test_pickle_read():
    """Benchmark body: unpickle ``df.pkl`` and discard the result.

    Nothing is returned; the function exists to be timed.
    """
    source = 'df.pkl'
    pandas.read_pickle(source)
def test_feather_write(df):
    """Benchmark body: write ``df`` to ``df.feather`` in Feather format.

    Nothing is returned; the function exists to be timed.
    """
    target = 'df.feather'
    df.to_feather(target)
def test_feather_read():
    """Benchmark body: read ``df.feather`` and discard the result.

    Nothing is returned; the function exists to be timed.
    """
    source = 'df.feather'
    pandas.read_feather(source)
def test_numpy_write_npy(df):
    """Benchmark body: save ``df`` as a NumPy record array in ``df.npy``.

    The frame is converted with ``df.to_records()`` before being handed to
    ``numpy.save``. Nothing is returned; the function exists to be timed.
    """
    records = df.to_records()
    numpy.save('df.npy', records)
def test_numpy_read_npy():
    """Benchmark body: load the record array stored in ``df.npy``.

    The file written by ``numpy.save('df.npy', df.to_records())`` contains
    an object-dtype field (the string column ``z``), which NumPy stores by
    pickling. Since NumPy 1.16.3 ``numpy.load`` refuses to unpickle unless
    ``allow_pickle=True`` is passed, so the flag is required for the load
    to succeed at all.

    The loaded array is discarded; the function exists to be timed.
    """
    # NOTE: allow_pickle=True executes pickled payloads. That is acceptable
    # here only because df.npy is produced locally by this benchmark; do
    # not point this at untrusted files.
    numpy.load('df.npy', allow_pickle=True)
def test_arrow_stream_write(df):
    """Benchmark body: write ``df`` to ``df.arrow`` as an Arrow stream.

    The frame is converted to a single record batch (without its index)
    and written through a ``RecordBatchStreamWriter``.

    The writer and the underlying file are closed explicitly so that the
    stream is properly terminated and flushed before the function returns,
    instead of relying on garbage collection to release the handle.
    Nothing is returned; the function exists to be timed.
    """
    batch = pyarrow.RecordBatch.from_pandas(df, preserve_index=False)
    with pyarrow.OSFile('df.arrow', 'wb') as sink:
        writer = pyarrow.RecordBatchStreamWriter(sink, batch.schema)
        try:
            writer.write_batch(batch)
        finally:
            # Closing the writer finalizes the stream before the file
            # itself is closed by the enclosing ``with``.
            writer.close()
def test_arrow_stream_read():
    """Benchmark body: read ``df.arrow`` into pandas and discard the result.

    The file is opened in a ``with`` block so the OS-level handle is
    released deterministically on every call rather than leaking until
    garbage collection; ``read_pandas()`` is called while the file is
    still open. Nothing is returned; the function exists to be timed.
    """
    with pyarrow.OSFile('df.arrow', 'rb') as source:
        pyarrow.RecordBatchStreamReader(source).read_pandas()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment