Skip to content

Instantly share code, notes, and snippets.

@rvernica
Last active September 26, 2017 03:53
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save rvernica/354ad9057bdec44ca5471a1b8eec6765 to your computer and use it in GitHub Desktop.
# AFL% set no fetch; store(apply(build(<x:int64>[i=1:1000; j=1:1000], i + j * 1000), y, 1e6 / double(i + 1), z, string(i)), df); set fetch;
# Query was executed successfully
# AFL% summarize(df);
# {inst,attid} att,count,bytes,chunks,min_count,avg_count,max_count,min_bytes,avg_bytes,max_bytes
# {0,0} 'all',1000000,8041133,4,1000000,1e+06,1000000,48,2.01028e+06,8000072
# In [17]: sys.getsizeof(df)/1024/1024.
# Out[17]: 87.70475769042969 MB
# In [15]: df.dtypes
# Out[15]:
# i int64
# j int64
# x float64
# y float64
# z object
# dtype: object
# 35M Aug 21 19:24 df.npy
# 37M Aug 21 19:02 df.pkl
# 38M Aug 21 19:03 df.feather
# 38M Sep 25 19:37 df.arrow
# In [8]: %timeit test_numpy_write_npy(df)
# 2.1 s ± 35.2 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
# In [9]: 35 / 2.1
# Out[9]: 16.666666666666664 MB/s
# In [10]: %timeit test_pickle_write(df)
# 510 ms ± 6.84 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
# In [11]: 37 / .510
# Out[11]: 72.54901960784314 MB/s
# In [4]: %timeit test_feather_write(df)
# 336 ms ± 17.2 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
# In [5]: 38 / .336
# Out[5]: 113.09523809523809 MB/s
# In [6]: %timeit test_arrow_stream_write(df)
# 254 ms ± 34.2 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
# In [7]: 38 / .254
# Out[7]: 149.60629921259843 MB/s
# In [12]: %timeit test_numpy_read_npy()
# 1.18 s ± 3.58 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
# In [14]: 35 / 1.18
# Out[14]: 29.661016949152543 MB/s
# In [13]: %timeit test_pickle_read()
# 121 ms ± 1.97 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
# In [15]: 37 / .121
# Out[15]: 305.78512396694214 MB/s
# In [16]: %timeit test_feather_read()
# 67.1 ms ± 2.83 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
# In [17]: 38 / .0671
# Out[17]: 566.3189269746646 MB/s
# In [18]: %timeit test_arrow_stream_read()
# 70.5 ms ± 788 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
# In [19]: 38 / .0705
# Out[19]: 539.0070921985816 MB/s
import numpy
import pandas
import pyarrow
import scidbpy
import sys
def read_scidb():
    """Fetch the `df` array from SciDB and return it as a pandas DataFrame."""
    return scidbpy.connect().arrays.df.fetch(as_dataframe=True)
def read_pickle():
    """Load the benchmark DataFrame previously pickled to 'df.pkl'."""
    frame = pandas.read_pickle('df.pkl')
    return frame
def test_pickle_write(df):
df.to_pickle('df.pkl')
def test_pickle_read():
pandas.read_pickle('df.pkl')
def test_feather_write(df):
    """Serialize *df* to 'df.feather' in the Feather format (timing target)."""
    path = 'df.feather'
    df.to_feather(path)
def test_feather_read():
    """Deserialize 'df.feather'; the result is discarded (I/O timing only)."""
    _ = pandas.read_feather('df.feather')
def test_numpy_write_npy(df):
numpy.save('df.npy', df.to_records())
def test_numpy_read_npy():
numpy.load('df.npy')
def test_arrow_stream_write(df):
    """Write *df* as a single record batch in the Arrow IPC stream format.

    Fixes a resource leak in the original: the ``OSFile`` and the
    ``RecordBatchStreamWriter`` were never closed, so flushing and the
    stream's end-of-stream marker depended on garbage collection.
    Closing the writer finalizes the stream; closing the file releases
    the OS handle.
    """
    batch = pyarrow.RecordBatch.from_pandas(df, preserve_index=False)
    sink = pyarrow.OSFile('df.arrow', 'wb')
    try:
        writer = pyarrow.RecordBatchStreamWriter(sink, batch.schema)
        try:
            writer.write_batch(batch)
        finally:
            writer.close()
    finally:
        sink.close()
def test_arrow_stream_read():
    """Read 'df.arrow' back into a pandas DataFrame (result discarded).

    Fixes a resource leak in the original: the ``OSFile`` handle was
    never closed and its release depended on garbage collection.
    """
    source = pyarrow.OSFile('df.arrow', 'rb')
    try:
        pyarrow.RecordBatchStreamReader(source).read_pandas()
    finally:
        source.close()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment