Skip to content

Instantly share code, notes, and snippets.

What would you like to do?
Benchmark read and write speed of TileDB
import glob
import os
import datetime
import tiledb
import pandas as pd
from tiledb.dataframe_ import _tiledb_result_as_dataframe
from tqdm import tqdm
import time
import numpy as np
# Take the configuration of a fresh TileDB context and set the initial buffer
# size used by the Python bindings.
cfg = tiledb.Ctx().config()
cfg.update({'py.init_buffer_bytes': 1024**2 * 50})  # 50MB per attribute
# NOTE(review): nothing visible in this file hands `cfg` to a context, so this
# setting may never take effect - confirm whether `tiledb.default_ctx(cfg)` (or
# an explicit `tiledb.Ctx(cfg)`) was intended here.
def load_data(data_path = "eodprices.parquet"):
    """Read a parquet file into a DataFrame and print how long the read took.

    Args:
        data_path: Path of the parquet file handed to ``pd.read_parquet``.

    Returns:
        The DataFrame returned by ``pd.read_parquet``.
    """
    # perf_counter is a monotonic, high-resolution clock meant for measuring
    # durations; the wall clock can jump while a benchmark is running.
    started = time.perf_counter()
    frame = pd.read_parquet(data_path)
    elapsed = time.perf_counter() - started
    # Report before returning: returning first makes the timing unreachable.
    print(f"loading data takes {elapsed}s")
    return frame
def write_tiledb(df, path, tile=1024**2):
    """Write ``df`` to a dense TileDB array and print the elapsed time.

    Args:
        df: DataFrame handed to ``tiledb.from_pandas``.
        path: Array location passed as the first argument of
            ``tiledb.from_pandas``.
        tile: Tile setting forwarded unchanged to ``tiledb.from_pandas``.
    """
    # perf_counter is monotonic, so the measured interval cannot be distorted
    # by system clock adjustments during the write.
    started = time.perf_counter()
    tiledb.from_pandas(path, df, sparse=False, tile=tile)
    elapsed = time.perf_counter() - started
    print(f"Saving to tiledb takes {elapsed}")
def read_tiledb(path):
    """Read a whole TileDB array into a DataFrame and print the elapsed time.

    Args:
        path: Array location passed to ``tiledb.open_dataframe``.

    Returns:
        The DataFrame returned by ``tiledb.open_dataframe``.
    """
    # perf_counter is monotonic, so the measured interval cannot be distorted
    # by system clock adjustments during the read.
    started = time.perf_counter()
    df = tiledb.open_dataframe(path)
    elapsed = time.perf_counter() - started
    print(f"reading from tiledb takes {elapsed}")
    return df
def read_single_column(attrs, path, ctx=None):
    """Read selected attributes of a TileDB array over its non-empty domain.

    Prints the elapsed time of the whole operation.

    Args:
        attrs: Attribute selection passed to ``A.query``.
        path: Location of the array to open.
        ctx: TileDB context to open the array with; when ``None`` the result
            of ``tiledb.default_ctx()`` is used.

    Returns:
        The DataFrame built by ``_tiledb_result_as_dataframe`` from the query
        result.
    """
    started = time.perf_counter()
    if ctx is None:
        ctx = tiledb.default_ctx()
    # TODO support `distributed=True` option?
    # NOTE(review): the opening expression was lost in the source text
    # ("with, ctx=ctx) as A:"). It is reconstructed as tiledb.open(path, ...)
    # because `path` is otherwise unused - confirm against the original.
    with tiledb.open(path, ctx=ctx) as A:
        nonempty = A.nonempty_domain()
        # One slice per dimension, spanning that dimension's non-empty bounds.
        full_extent = tuple(slice(lower, upper) for lower, upper in nonempty)
        data = A.query(attrs).multi_index[full_extent]
        new_df = _tiledb_result_as_dataframe(A, data)
    elapsed = time.perf_counter() - started
    print(f"reading single column form tiledb takes {elapsed}s")
    return new_df
def read_slicing(uri="benchmark1.tdb"):
    """Read a fixed date range from a TileDB array into a DataFrame.

    The first dimension is sliced from 2005-02-25 to 2010-11-03 and the second
    dimension is taken in full.

    Args:
        uri: Location of the array to open.
            NOTE(review): the original default value was lost in the source
            text ("uri=)"); "benchmark1.tdb" is the array this script writes
            and is only a stand-in - confirm the intended default.

    Returns:
        The DataFrame built by ``_tiledb_result_as_dataframe`` from the sliced
        result.
    """
    # NOTE(review): the opening expression was also lost ("with as A:");
    # reconstructed as tiledb.open(uri) - confirm against the original.
    with tiledb.open(uri) as A:
        # Indexing a Query object retrieves only the attributes selected on
        # it, e.g. A.query(attrs=('S_FA_ROA',)); no selection is made here.
        q = A.query()
        data = q[np.datetime64('2005-02-25'):np.datetime64('2010-11-03'), :]
        new_df = _tiledb_result_as_dataframe(A, data)
    return new_df
# Benchmark 1: load the default parquet file, then write it out as the
# TileDB array "benchmark1.tdb"; both helpers print their own timings.
df = load_data()
write_tiledb(df=df, path="benchmark1.tdb")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment