Last active
January 30, 2023 08:01
-
-
Save joonas-yoon/7a67039b90d0554b26a5abdcba23343b to your computer and use it in GitHub Desktop.
jad vs. pandas
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import time | |
import json_as_db as jad | |
import pandas as pd | |
from tqdm import tqdm | |
import warnings | |
# Silence FutureWarning for the whole run so it does not clutter the progress bars.
warnings.simplefilter('ignore', FutureWarning)
def measure(func):
    """
    Return how many milliseconds a single call to ``func`` takes.

    ``func`` is invoked once with no arguments; its return value is discarded.
    """
    # perf_counter() is monotonic and uses the highest-resolution clock
    # available, whereas time.time() follows the adjustable wall clock and
    # can be too coarse for sub-millisecond measurements.
    start = time.perf_counter()
    func()
    elapsed_seconds = time.perf_counter() - start
    return elapsed_seconds * 1000
def bench(func, k: int = 1, params: "list | None" = None, title: str = ''):
    """
    Time ``func`` repeatedly and print the average and total duration.

    Args:
        func: Callable under test; invoked as ``func(**param)``.
        k: Number of passes over ``params``.
        params: Keyword-argument dicts; ``func`` is called once per dict per
            pass. Defaults to a single empty dict (one argument-less call).
        title: Label appended to the progress-bar description.
    """
    # Avoid a shared mutable default argument.
    if params is None:
        params = [{}]
    pcnt = len(params)
    pstr = f" x {pcnt} params" if pcnt > 1 else ''
    desc = f"[run x {k}{pstr}] {title}"
    timings = []
    # The context manager closes the bar even when ``func`` raises.
    with tqdm(total=k * pcnt, desc=desc) as pbar:
        for _ in range(k):
            for param in params:
                timings.append(measure(lambda: func(**param)))
                pbar.update(1)
    # Keep the timings alive until both figures are computed: deleting the
    # list before summing it for the report raises NameError.
    total = sum(timings)
    # Guard against k == 0 or an empty ``params`` list.
    average = total / len(timings) if timings else 0.0
    print(f'>>> (avg.) {average:.4f} ms, (total) {total:.2f} ms')
# Input data for every benchmark below; the file was downloaded from
# https://json-generator.com/
file_name = "generated_10k.json"
# ------------------- | |
# Loads from file | |
# ------------------- | |
def jad_load_json():
    """Call ``load(file_name)`` on a fresh ``jad.Database`` and return the result."""
    database = jad.Database()
    return database.load(file_name)
def pd_load_json():
    """Read ``file_name`` with ``pd.read_json`` and return the result."""
    frame = pd.read_json(file_name)
    return frame
# Time 1000 loads of the JSON file with json_as_db.
bench(
    jad_load_json,
    1000,
    title=f"jad.Database().load('{file_name}')",
)
# Time 1000 loads of the same file with pandas.
bench(
    pd_load_json,
    1000,
    title=f"pd.read_json('{file_name}')",
)
# ------------------- | |
# Appending | |
# ------------------- | |
import json

# Parse the benchmark file once into a plain list of records that the append
# benchmarks draw from. Reuses ``file_name`` instead of repeating the path,
# and pins the encoding so the read does not depend on the platform default.
with open(file_name, 'r', encoding='utf-8') as fp:
    pool: list = json.load(fp)
def pd_appends():
    """
    Grow a DataFrame one row at a time from the first 1000 ``pool`` records.

    ``DataFrame.append`` returned a new frame rather than mutating in place,
    so calling it without reassigning never accumulated rows; it was also
    removed in pandas 2.0. ``pd.concat`` with reassignment is the supported
    way to append, and makes the frame actually grow on every iteration.

    NOTE(review): the title passed to ``bench`` for this function still
    mentions `pd.append`; the recorded output already says `pd.concat`.
    """
    df = pd.DataFrame([pool[0]])
    for i in range(1, 1000):
        row = pd.DataFrame([pool[i]])
        df = pd.concat([df, row], ignore_index=True)
    del df
def jad_appends():
    """Add the first 1000 ``pool`` records to a fresh ``jad.Database``, one ``add`` call each."""
    database = jad.Database()
    position = 0
    while position < 1000:
        database.add(pool[position])
        position += 1
    del database
# 1000 timed runs of the json_as_db append loop.
bench(
    jad_appends,
    1000,
    title="appending test with `jad.Database.add` one by one",
)
# Only 50 timed runs of the pandas append loop.
bench(
    pd_appends,
    50,
    title="appending test with `pd.append` one by one",
)
# ------------------- | |
# Searching | |
# ------------------- | |
# Module-level fixtures used by the search, get and update benchmarks below:
# one jad database and one pandas frame, both loaded from ``file_name``.
db = jad_load_json()
df = pd_load_json()
def pd_search(i: int):
    """Select the rows of ``df`` whose 'index' column exceeds ``i``; the result is discarded."""
    mask = df['index'] > i
    matched = df[mask]
    del matched
def jad_search(i: int):
    """Run ``db.find`` with the condition ``jad.Key('index') > i``; the result is discarded."""
    condition = jad.Key('index') > i
    found = db.find(condition)
    del found
# Ten passes over every threshold i in 0..9999 with json_as_db.
bench(
    jad_search,
    10,
    params=[{'i': threshold} for threshold in range(10000)],
    title="Searching items with `db.find(jad.Key('index') > i)` for i: 0 -> 10000",
)
# Same thresholds and pass count with pandas boolean indexing.
bench(
    pd_search,
    10,
    params=[{'i': threshold} for threshold in range(10000)],
    title="Searching items with `df[df['index'] > i]` for i: 0 -> 10000",
)
# ------------------- | |
# Get a single item | |
# ------------------- | |
def pd_get_single(i: int):
    """Fetch the row at position ``i`` of ``df`` via ``iloc``; the row is discarded."""
    fetched = df.iloc[i]
def jad_get_single(id: str):
    """Fetch the item stored under ``id`` via ``db.get``; the item is discarded."""
    fetched = db.get(id)
# 100 passes over every key currently in ``db``.
bench(
    jad_get_single,
    100,
    params=[{'id': key} for key in list(db.keys())],
    title="Get single item by `jad.Database.get(id)`",
)
# 100 passes over every row position of ``df``.
bench(
    pd_get_single,
    100,
    params=[{'i': position} for position in range(len(df))],
    title="Get single item by `pd.DataFrame.iloc[i]`",
)
# ------------------- | |
# Update item | |
# ------------------- | |
# Replacement value written into the 'about' field by the update benchmarks.
TARGET_STR = "foobar"
def pd_update(i: int):
    """Overwrite the 'about' cell of row label ``i`` in ``df`` with ``TARGET_STR``."""
    column = 'about'
    df.at[i, column] = TARGET_STR
def jad_update(id: str):
    """Call ``db.modify`` for ``id`` with a mapping that sets 'about' to ``TARGET_STR``."""
    changes = {'about': TARGET_STR}
    db.modify(id, changes)
# bench( | |
# k=100, | |
# title=f"Update single item by `db.modify(id, mapping)`", | |
# func=jad_update, | |
# params=[dict(id=id) for id in list(db.keys())], | |
# ) | |
# bench( | |
# k=100, | |
# title=f"Update single item by `df.at[i, col] = value`", | |
# func=pd_update, | |
# params=[dict(i=i) for i in range(len(df))], | |
# ) | |
# ------------------- | |
# Update items | |
# ------------------- | |
# Same value as the TARGET_STR assigned for the single-item update section;
# re-assigned here so this section reads on its own.
TARGET_STR = "foobar"
def pd_updates(i: int):
    """
    Overwrite the 'about' column of five consecutive rows of ``df``.

    ``.loc`` label slices include BOTH endpoints, so ``i:i+5`` would touch six
    rows. The json_as_db counterpart is fed five ids per call
    (``db_keys[i:i+5]``), so the slice stops at ``i + 4`` to keep the two
    benchmarks doing the same amount of work.

    NOTE(review): assumes ``df`` keeps the default integer row labels - confirm.
    """
    df.loc[i:i + 4, ['about']] = TARGET_STR
def jad_updates(id: list):
    """Call ``db.modify`` once with all of ``id`` and one 'about' mapping per entry."""
    values = []
    for _ in id:
        # A separate dict per entry, not one shared instance.
        values.append({'about': TARGET_STR})
    db.modify(id, values)
# Snapshot of the database keys as a list so it can be sliced into batches.
db_keys = [*db.keys()]
# bench( | |
# k=100, | |
# title=f"Update mutiple items by `db.modify(ids, mappings)`", | |
# func=jad_updates, | |
# params=[dict(id=db_keys[i:i+5]) for i in range(len(db_keys)-5)], | |
# ) | |
# bench( | |
# k=100, | |
# title=f"Update mutiple items by `df.loc[i:i+5, [col]] = value`", | |
# func=pd_updates, | |
# params=[dict(i=i) for i in range(len(df)-5)], | |
# ) | |
""" | |
Output | |
------ | |
[run x 1000] jad.Database().load('generated_10k.json') | |
>>> (avg.) 149.11810 ms | |
[run x 1000] pd.read_json('generated_10k.json') | |
>>> (avg.) 153.71676 ms | |
[run x 1000] appending test with `jad.Database.add` one by one | |
>>> (avg.) 8.96103 ms | |
[run x 500] appending test with `pd.concat` one by one | |
>>> (avg.) 2760.27654 ms | |
[run] Searching items with `db.find(jad.Key('index') > i)` for i: 0 -> 10000 | |
>>> (avg.) 9.87914 ms | |
[run] Searching items with `df[df['index'] > i]` for i: 0 -> 10000 | |
>>> (avg.) 2.59354 ms | |
[run x 100 x 10000 params] Get single item from `jad.Database.get(id)` | |
>>> (avg.) 0.0039 ms | |
[run x 100 x 10000 params] Get single item from `pd.DataFrame.iloc[i]` | |
>>> (avg.) 0.0689 ms | |
[run x 100 x 10000 params] Update single item by `db.modify(id, mapping)`: 100%|██████████████████████████| 1000000/1000000 [00:08<00:00, 119453.14it/s] | |
>>> (avg.) 0.0074 ms | |
[run x 100 x 10000 params] Update single item by `df.at[i, col] = value`: 100%|████████████████████████████| 1000000/1000000 [00:16<00:00, 61751.81it/s] | |
>>> (avg.) 0.0148 ms | |
[run x 100 x 9995 params] Update mutiple items by `db.modify(ids, mappings)`: 100%|██████████████████████████| 999500/999500 [00:14<00:00, 69848.90it/s] | |
>>> (avg.) 0.0130 ms | |
[run x 100 x 9995 params] Update mutiple items by `df.loc[i:i+5, [col]] = value`: 100%|███████████████████████| 999500/999500 [15:52<00:00, 1049.22it/s] | |
>>> (avg.) 0.9432 ms | |
""" |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
149.11810 ms
153.71676 ms
8.96103 ms
2760.27654 ms
9.87914 ms
2.59354 ms
0.0039 ms
0.0689 ms
0.0074 ms
0.0148 ms
0.0130 ms
0.9432 ms
0.0012 ms
6.0930 ms