Skip to content

Instantly share code, notes, and snippets.

@joonas-yoon
Last active January 30, 2023 08:01
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save joonas-yoon/7a67039b90d0554b26a5abdcba23343b to your computer and use it in GitHub Desktop.
Save joonas-yoon/7a67039b90d0554b26a5abdcba23343b to your computer and use it in GitHub Desktop.
jad vs. pandas
import time
import json_as_db as jad
import pandas as pd
from tqdm import tqdm
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
def measure(func):
    """
    Run ``func`` once and return how long it took, in milliseconds.

    ``func`` is called with no arguments and its return value is discarded.
    """
    # perf_counter is monotonic and high-resolution; time.time() follows the
    # wall clock, which can jump (NTP, DST) and skew or even negate a reading.
    start = time.perf_counter()
    func()
    elapsed_seconds = time.perf_counter() - start
    return elapsed_seconds * 1000
def bench(func, k: int = 1, params: 'list | None' = None, title: str = ''):
    """
    Time ``func`` repeatedly and print the average and total elapsed time.

    ``func`` is called ``k`` times for every entry of ``params``; each entry
    is a dict that is unpacked as keyword arguments (``func(**param)``).
    When ``params`` is omitted, ``func`` is called with no arguments.

    Args:
        func: callable under test.
        k: number of passes over ``params``.
        params: list of keyword-argument dicts; defaults to ``[{}]``.
        title: label shown in the progress bar description.
    """
    if params is None:
        # Built per call instead of sharing one mutable default list.
        params = [{}]
    pcnt = len(params)
    pstr = f" x {pcnt} params" if pcnt > 1 else ''
    desc = f"[run x {k}{pstr}] {title}"
    timings = []
    # The context manager closes the bar even if ``func`` raises.
    with tqdm(total=k * pcnt, desc=desc) as pbar:
        for _ in range(k):
            for param in params:
                timings.append(measure(lambda: func(**param)))
                pbar.update(1)
    # Compute both figures while the timings still exist; the original deleted
    # the list before formatting the total and crashed with a NameError.
    total = sum(timings)
    # Nothing was run when k == 0 or params is empty; report zero rather than
    # dividing by zero.
    average = total / len(timings) if timings else 0.0
    print(f'>>> (avg.) {average:.4f} ms, (total) {total:.2f} ms')
# File was downloaded from https://json-generator.com/
file_name = 'generated_10k.json'

# -------------------
# Loads from file
# -------------------


def jad_load_json():
    """Create a fresh ``jad.Database``, load ``file_name`` into it and return what ``load`` returns."""
    database = jad.Database()
    return database.load(file_name)


def pd_load_json():
    """Read ``file_name`` with ``pd.read_json`` and return the result."""
    loaded = pd.read_json(file_name)
    return loaded
# Time both loaders over the same file, 1000 runs each.
bench(
    jad_load_json,
    title=f"jad.Database().load('{file_name}')",
    k=1000,
)
bench(
    pd_load_json,
    title=f"pd.read_json('{file_name}')",
    k=1000,
)
# -------------------
# Appending
# -------------------
import json

# Parse the dataset once; the append benchmarks below draw their records from
# this list. Reuse ``file_name`` so the path is defined in a single place, and
# read as UTF-8 explicitly instead of relying on the platform default encoding.
with open(file_name, 'r', encoding='utf-8') as fp:
    pool: list = json.load(fp)
def pd_appends():
    """
    Grow a DataFrame one row at a time from the first 1000 records of ``pool``.

    The frame starts with ``pool[0]`` and the remaining 999 records are
    concatenated individually.
    """
    frame = pd.DataFrame([pool[0]])
    for i in range(1, 1000):
        # pd.concat returns a new frame, so the result must be re-bound or the
        # frame never grows. (The original called DataFrame.append and dropped
        # its return value; that method was also removed in pandas 2.0.)
        frame = pd.concat([frame, pd.DataFrame([pool[i]])], ignore_index=True)


def jad_appends():
    """Add the first 1000 records of ``pool`` to a fresh ``jad.Database`` one by one."""
    database = jad.Database()
    for i in range(0, 1000):
        database.add(pool[i])


bench(
    k=1000,
    title="appending test with `jad.Database.add` one by one",
    func=jad_appends,
)
bench(
    k=50,
    title="appending test with `pd.concat` one by one",
    func=pd_appends,
)
# -------------------
# Searching
# -------------------
# Shared fixtures for the benchmarks below: the same file loaded once through
# each library.
db = jad_load_json()
df = pd_load_json()
def pd_search(i: int):
    """Select the rows of the global ``df`` whose 'index' value is greater than ``i``; the result is discarded."""
    mask = df['index'] > i
    matched = df[mask]
    del matched
def jad_search(i: int):
    """Run ``db.find`` with the condition ``jad.Key('index') > i``; the result is discarded."""
    condition = jad.Key('index') > i
    found = db.find(condition)
    del found
# Each search is timed for every threshold i in 0..9999, 10 passes per library.
bench(
    jad_search,
    k=10,
    params=[{'i': i} for i in range(10000)],
    title="Searching items with `db.find(jad.Key('index') > i)` for i: 0 -> 10000",
)
bench(
    pd_search,
    k=10,
    params=[{'i': i} for i in range(10000)],
    title="Searching items with `df[df['index'] > i]` for i: 0 -> 10000",
)
# -------------------
# Get a single item
# -------------------
def pd_get_single(i: int):
    """Fetch the row at position ``i`` of the global ``df`` via ``iloc``; the row is discarded."""
    _row = df.iloc[i]
def jad_get_single(id: str):
    """Fetch the item stored under ``id`` from the global ``db`` via ``get``; the item is discarded."""
    _item = db.get(id)
# One lookup per stored key (jad) or per row position (pandas), 100 passes each.
bench(
    jad_get_single,
    k=100,
    params=[{'id': key} for key in db.keys()],
    title="Get single item by `jad.Database.get(id)`",
)
bench(
    pd_get_single,
    k=100,
    params=[{'i': position} for position in range(len(df))],
    title="Get single item by `pd.DataFrame.iloc[i]`",
)
# -------------------
# Update item
# -------------------
# Value written into the 'about' field by the update benchmarks.
TARGET_STR = 'foobar'


def pd_update(i: int):
    """Overwrite the 'about' cell of row label ``i`` in the global ``df`` with ``TARGET_STR``."""
    column = 'about'
    df.at[i, column] = TARGET_STR
def jad_update(id: str):
    """Call ``db.modify`` for the item ``id`` with a mapping that sets 'about' to ``TARGET_STR``."""
    changes = {'about': TARGET_STR}
    db.modify(id, changes)
# bench(
# k=100,
# title=f"Update single item by `db.modify(id, mapping)`",
# func=jad_update,
# params=[dict(id=id) for id in list(db.keys())],
# )
# bench(
# k=100,
# title=f"Update single item by `df.at[i, col] = value`",
# func=pd_update,
# params=[dict(i=i) for i in range(len(df))],
# )
# -------------------
# Update items
# -------------------
TARGET_STR = 'foobar'


def pd_updates(i: int):
    """
    Overwrite the 'about' cell of 5 consecutive rows of the global ``df``,
    starting at row label ``i``, with ``TARGET_STR``.
    """
    # .loc label slices include BOTH endpoints, so i:i+4 covers exactly 5 rows.
    # The original i:i+5 touched 6 rows, while the jad counterpart updates the
    # 5 keys of db_keys[i:i+5].
    # NOTE(review): assumes df keeps the default 0..n-1 integer index - confirm.
    df.loc[i:i + 4, ['about']] = TARGET_STR
def jad_updates(id: list):
    """Call ``db.modify`` with the ids in ``id`` and one ``{'about': TARGET_STR}`` mapping per id."""
    db.modify(id, [{'about': TARGET_STR} for _ in id])


# Snapshot of the database keys, taken once as a list.
db_keys = list(db.keys())
# bench(
# k=100,
# title=f"Update mutiple items by `db.modify(ids, mappings)`",
# func=jad_updates,
# params=[dict(id=db_keys[i:i+5]) for i in range(len(db_keys)-5)],
# )
# bench(
# k=100,
# title=f"Update mutiple items by `df.loc[i:i+5, [col]] = value`",
# func=pd_updates,
# params=[dict(i=i) for i in range(len(df)-5)],
# )
"""
Output
------
[run x 1000] jad.Database().load('generated_10k.json')
>>> (avg.) 149.11810 ms
[run x 1000] pd.read_json('generated_10k.json')
>>> (avg.) 153.71676 ms
[run x 1000] appending test with `jad.Database.add` one by one
>>> (avg.) 8.96103 ms
[run x 500] appending test with `pd.concat` one by one
>>> (avg.) 2760.27654 ms
[run] Searching items with `db.find(jad.Key('index') > i)` for i: 0 -> 10000
>>> (avg.) 9.87914 ms
[run] Searching items with `df[df['index'] > i]` for i: 0 -> 10000
>>> (avg.) 2.59354 ms
[run x 100 x 10000 params] Get single item from `jad.Database.get(id)`
>>> (avg.) 0.0039 ms
[run x 100 x 10000 params] Get single item from `pd.DataFrame.iloc[i]`
>>> (avg.) 0.0689 ms
[run x 100 x 10000 params] Update single item by `db.modify(id, mapping)`: 100%|██████████████████████████| 1000000/1000000 [00:08<00:00, 119453.14it/s]
>>> (avg.) 0.0074 ms
[run x 100 x 10000 params] Update single item by `df.at[i, col] = value`: 100%|████████████████████████████| 1000000/1000000 [00:16<00:00, 61751.81it/s]
>>> (avg.) 0.0148 ms
[run x 100 x 9995 params] Update mutiple items by `db.modify(ids, mappings)`: 100%|██████████████████████████| 999500/999500 [00:14<00:00, 69848.90it/s]
>>> (avg.) 0.0130 ms
[run x 100 x 9995 params] Update mutiple items by `df.loc[i:i+5, [col]] = value`: 100%|███████████████████████| 999500/999500 [15:52<00:00, 1049.22it/s]
>>> (avg.) 0.9432 ms
"""
@joonas-yoon
Copy link
Author

joonas-yoon commented Jan 27, 2023

| (avg. time per operation with 10K items) | json_as_db | pandas |
| --- | --- | --- |
| Loads from file | 149.11810 ms | 153.71676 ms |
| Append items | 8.96103 ms | 2760.27654 ms |
| Search an item | 9.87914 ms | 2.59354 ms |
| Get an item by key | 0.0039 ms | 0.0689 ms |
| Updating an item | 0.0074 ms | 0.0148 ms |
| Updating 5 items in a row | 0.0130 ms | 0.9432 ms |
| Remove an item | 0.0012 ms | 6.0930 ms |

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment