Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
Benchmark read and write speed of TileDB
#%%
import glob
import os
import datetime
import tiledb
import pandas as pd
from tiledb.dataframe_ import _tiledb_result_as_dataframe
from tqdm import tqdm
import time
import numpy as np
# Start from a fresh context's config, raise the initial read buffer size, and
# hand the result to tiledb.default_ctx.
# NOTE(review): TileDB-Py appears to allow initializing the default context
# only once per process, so this cell presumably has to run before any other
# TileDB call -- confirm against the TileDB-Py docs.
cfg = tiledb.Ctx().config()
cfg.update({"py.init_buffer_bytes": 50 * 1024**2})  # 50MB per attribute
tiledb.default_ctx(cfg)
#%%
def load_data(data_path="eodprices.parquet"):
    """Load a Parquet file into a DataFrame and print how long the load took.

    Args:
        data_path: Path of the Parquet file, passed to ``pd.read_parquet``.

    Returns:
        The DataFrame returned by ``pd.read_parquet``.
    """
    ts = time.time()
    # Hold the frame in a local and return it last, so the elapsed time is
    # measured and reported before the function exits.
    df = pd.read_parquet(data_path)
    te = time.time()
    print(f"loading data takes {te-ts}s")
    return df
#%%
def write_tiledb(df, path, tile=1024**2):
    """Store ``df`` as a dense TileDB array and print the elapsed time.

    Args:
        df: DataFrame handed to ``tiledb.from_pandas``.
        path: URI passed to ``tiledb.from_pandas`` as the array location.
        tile: Forwarded unchanged as the ``tile`` argument of
            ``tiledb.from_pandas``.
    """
    started = time.time()
    tiledb.from_pandas(path, df, sparse=False, tile=tile)
    elapsed = time.time() - started
    print(f"Saving to tiledb takes {elapsed}")
def read_tiledb(path):
    """Open the TileDB array at ``path`` as a DataFrame and print the elapsed time.

    Args:
        path: URI passed to ``tiledb.open_dataframe``.

    Returns:
        The DataFrame returned by ``tiledb.open_dataframe``.
    """
    started = time.time()
    frame = tiledb.open_dataframe(path)
    elapsed = time.time() - started
    print(f"reading from tiledb takes {elapsed}")
    return frame
#%%
def read_single_column(attrs, path, ctx=None):
    """Read only the selected attributes of a TileDB array into a DataFrame.

    The array's whole non-empty domain is queried, restricted to ``attrs``,
    and the elapsed wall-clock time is printed.

    Args:
        attrs: Attribute names passed to ``A.query`` (for example
            ``["S_DQ_AVGPRICE"]``).
        path: URI of the TileDB array to open.
        ctx: TileDB context to open the array with; when ``None`` the result
            of ``tiledb.default_ctx()`` is used.

    Returns:
        The DataFrame built from the query result by
        ``_tiledb_result_as_dataframe``.
    """
    t0 = time.time()
    if ctx is None:
        ctx = tiledb.default_ctx()
    # TODO support `distributed=True` option?
    with tiledb.open(path, ctx=ctx) as A:
        nonempty = A.nonempty_domain()
        # One slice per dimension, spanning that dimension's non-empty bounds.
        # NOTE(review): multi_index is documented as using inclusive ranges,
        # which is what makes (lower, upper) cover the last cell -- confirm.
        full_extent = tuple(slice(lower, upper) for lower, upper in nonempty)
        data = A.query(attrs).multi_index[full_extent]
        # Convert while the array is still open; the helper receives A itself.
        new_df = _tiledb_result_as_dataframe(A, data)
    t1 = time.time()
    print(f"reading single column from tiledb takes {t1-t0}s")
    return new_df
#%%
def read_slicing(uri="benchmark1.tdb", start="2005-02-25", end="2010-11-03"):
    """Read a date range of a TileDB array into a DataFrame.

    The first dimension is sliced from ``start`` to ``end`` (both converted
    with ``np.datetime64``) and the second dimension is taken in full.

    Args:
        uri: URI of the TileDB array to open. Defaults to the array written
            by the benchmark script in this file.
        start: Lower bound of the slice on the first dimension; anything
            ``np.datetime64`` accepts.
        end: Upper bound of the slice on the first dimension; anything
            ``np.datetime64`` accepts.

    Returns:
        The DataFrame built from the query result by
        ``_tiledb_result_as_dataframe``.
    """
    with tiledb.open(uri) as A:
        # To restrict the read to specific attributes, build the query as
        # e.g. A.query(attrs=('S_FA_ROA',)); indexing the Query object will
        # only retrieve the selected attribute(s).
        q = A.query()
        data = q[np.datetime64(start):np.datetime64(end), :]
        # Convert while the array is still open; the helper receives A itself.
        new_df = _tiledb_result_as_dataframe(A, data)
    return new_df
#%%
# benchmark 1
# Load the data, write it to one TileDB array, then time a full read and a
# read limited to a single attribute of that same array.
benchmark_uri = "benchmark1.tdb"
df = load_data()
write_tiledb(df, benchmark_uri)
read_tiledb(benchmark_uri)
read_single_column(["S_DQ_AVGPRICE"], benchmark_uri)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment