Skip to content

Instantly share code, notes, and snippets.

@qiuwei
Created December 11, 2020 03:10
Show Gist options
  • Save qiuwei/e1118d19692412a94f53abdb0536c441 to your computer and use it in GitHub Desktop.
Save qiuwei/e1118d19692412a94f53abdb0536c441 to your computer and use it in GitHub Desktop.
Benchmark the read and write speed of TileDB
#%%
import glob
import os
import datetime
import tiledb
import pandas as pd
from tiledb.dataframe_ import _tiledb_result_as_dataframe
from tqdm import tqdm
import time
import numpy as np
# Build a TileDB configuration with a larger initial read buffer and install
# it on the library's default context, so the benchmark functions below that
# rely on the default context pick it up.
# NOTE(review): presumably this must run before the default context is first
# used anywhere else -- confirm against the TileDB-Py documentation.
cfg = tiledb.Ctx().config()
cfg.update(
{
'py.init_buffer_bytes': 1024**2 * 50 # 50MB per attribute
}
)
tiledb.default_ctx(cfg)
#%%
def load_data(data_path="eodprices.parquet"):
    """Load the benchmark dataset from a Parquet file and report the load time.

    Args:
        data_path: Path of the Parquet file handed to ``pd.read_parquet``.

    Returns:
        The DataFrame produced by ``pd.read_parquet``.
    """
    ts = time.time()
    # Keep the frame in a local first: returning straight from the read call
    # would skip the timing report below.
    df = pd.read_parquet(data_path)
    te = time.time()
    print(f"loading data takes {te-ts}s")
    return df
#%%
def write_tiledb(df, path, tile=1024**2):
    """Store ``df`` as a dense TileDB array at ``path`` and print the elapsed time.

    Args:
        df: DataFrame to persist.
        path: Destination URI of the TileDB array.
        tile: Forwarded unchanged as the ``tile`` argument of
            ``tiledb.from_pandas``.
    """
    started = time.time()
    # sparse=False requests a dense array.
    tiledb.from_pandas(path, df, sparse=False, tile=tile)
    elapsed = time.time() - started
    print(f"Saving to tiledb takes {elapsed}")
def read_tiledb(path):
    """Read the whole TileDB array at ``path`` as a DataFrame and print the elapsed time.

    Args:
        path: URI of the TileDB array to open.

    Returns:
        The DataFrame returned by ``tiledb.open_dataframe``.
    """
    started = time.time()
    frame = tiledb.open_dataframe(path)
    finished = time.time()
    print(f"reading from tiledb takes {finished - started}")
    return frame
#%%
def read_single_column(attrs, path, ctx=None):
    """Read only the selected attributes of a TileDB array and time the read.

    The query spans the array's entire non-empty domain, so every stored cell
    is returned, but only for the requested attributes.

    Args:
        attrs: Attribute (column) names to retrieve, passed to ``A.query``.
        path: URI of the TileDB array.
        ctx: TileDB context to open the array with; when ``None`` the
            library's default context is used.

    Returns:
        The DataFrame built by ``_tiledb_result_as_dataframe`` from the
        query result.
    """
    t0 = time.time()
    if ctx is None:
        ctx = tiledb.default_ctx()
    # TODO support `distributed=True` option?
    with tiledb.open(path, ctx=ctx) as A:
        nonempty = A.nonempty_domain()
        # One slice per dimension, running from the lower to the upper bound
        # of the non-empty domain.
        # NOTE(review): relies on multi_index treating slice bounds as
        # inclusive so the last cell is not dropped -- confirm in TileDB-Py docs.
        full_range = tuple(slice(lo, hi) for lo, hi in nonempty)
        data = A.query(attrs=attrs).multi_index[full_range]
        new_df = _tiledb_result_as_dataframe(A, data)
    t1 = time.time()
    print(f"reading single column from tiledb takes {t1-t0}s")
    return new_df
#%%
def read_slicing(uri="benchmark1.tdb"):
    """Read a fixed date-range slice of a TileDB array as a DataFrame.

    Args:
        uri: URI of the TileDB array. Defaults to ``"benchmark1.tdb"``, the
            array written by the benchmark at the bottom of this file.

    Returns:
        The DataFrame built by ``_tiledb_result_as_dataframe`` from the
        sliced query result.
    """
    with tiledb.open(uri) as A:
        # Indexing a Query object retrieves only the attributes selected on
        # it; pass e.g. attrs=('S_FA_ROA',) to A.query() to restrict the read
        # to specific attributes. With no arguments all attributes are read.
        q = A.query()
        # NOTE(review): assumes a two-dimensional array whose first dimension
        # is a datetime -- confirm against the schema of the stored array.
        data = q[np.datetime64('2005-02-25'):np.datetime64('2010-11-03'), :]
        new_df = _tiledb_result_as_dataframe(A, data)
    return new_df
#%%
# Benchmark 1: load the Parquet dataset, write it to a TileDB array, then
# time a full read and a read restricted to one attribute. Each helper prints
# its own elapsed time; the DataFrames returned by the two reads are discarded.
df = load_data()
write_tiledb(df, "benchmark1.tdb")
read_tiledb("benchmark1.tdb")
# Only the S_DQ_AVGPRICE attribute is requested here.
read_single_column(["S_DQ_AVGPRICE"],"benchmark1.tdb")
@Neelaksh-Singh
Copy link

Hi, could you also provide the code for the Parquet benchmark test?

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment