joonas-yoon/benchmark.py

## benchmark.py
import time
import json_as_db as jad
import pandas as pd
from tqdm import tqdm

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)


def measure(func):
    """
    Returns how many milliseconds a single call of `func` takes.

    `func` is invoked exactly once with no arguments; its return value is
    discarded.
    """
    # perf_counter is monotonic and uses the highest-resolution clock
    # available, unlike time.time() which follows the (adjustable) wall
    # clock and can be too coarse for sub-millisecond measurements.
    start = time.perf_counter()
    func()
    end = time.perf_counter()
    return (end - start) * 1000


def bench(func, k: int = 1, params: list = [{}], title: str = ''):
    """
    Times `func` and prints the average and total elapsed milliseconds.

    `func` is called once per entry of `params`, and the whole pass over
    `params` is repeated `k` times, so `k * len(params)` calls are timed.

    Args:
        func: callable under test, invoked as `func(**param)`.
        k: number of passes over `params`.
        params: list of keyword-argument dicts, one timed call per dict.
            The shared default `[{}]` (one call without arguments) is only
            read here, never mutated.
        title: label appended to the progress bar description.
    """
    pcnt = len(params)
    pstr = f" x {pcnt} params" if pcnt > 1 else ''
    desc = f"[run x {k}{pstr}] {title}"
    t = []
    pbar = tqdm(total=k * pcnt, desc=desc)
    try:
        for _ in range(k):
            for param in params:
                t.append(measure(lambda: func(**param)))
                pbar.update(1)
    finally:
        # Close the bar even when `func` raises, so the terminal is not
        # left with a dangling progress line.
        pbar.close()
    # Both aggregates are computed while the samples still exist; nothing
    # was timed when k == 0 or params is empty, so report zero instead of
    # dividing by zero.
    total = sum(t)
    average = total / len(t) if t else 0.0
    print(f'>>> (avg.) {average:.4f} ms, (total) {total:.2f} ms')


# Input JSON file used by the benchmarks below.
# File was downloaded from https://json-generator.com/
file_name = 'generated_10k.json'

# -------------------
# Loads from file
# -------------------


def jad_load_json():
    """
    Loads `file_name` through a new `jad.Database` and returns whatever
    `load` returns.
    """
    database = jad.Database()
    return database.load(file_name)
def pd_load_json():
    """
    Reads `file_name` with `pd.read_json` and returns the result.
    """
    loaded = pd.read_json(file_name)
    return loaded


# Time a full load of the input file with each library, 1000 runs each.
bench(
    jad_load_json,
    k=1000,
    title=f"jad.Database().load('{file_name}')",
)
bench(
    pd_load_json,
    k=1000,
    title=f"pd.read_json('{file_name}')",
)

# -------------------
# Appending
# -------------------

# Only this section needs the raw records, so the import is kept local to it.
import json

# Read the same file the other benchmarks use (`file_name`) instead of
# repeating the literal, and decode it explicitly as UTF-8 rather than with
# the platform's locale default.
with open(file_name, 'r', encoding='utf-8') as fp:
    # Raw records that the appending benchmarks index into.
    pool: list = json.load(fp)


def pd_appends():
    """
    Builds a DataFrame from the first 1000 records of `pool`, adding one
    row at a time.
    """
    df = pd.DataFrame([pool[0]])
    for i in range(1, 1000):
        # `DataFrame.append` never modified the frame in place (and was
        # removed in pandas 2.0), so the returned frame has to be rebound
        # for the rows to accumulate; `pd.concat` is its replacement.
        df = pd.concat([df, pd.DataFrame([pool[i]])], ignore_index=True)
    del df


def jad_appends():
    """
    Adds the first 1000 records of `pool` to a new `jad.Database`, one
    `add` call per record.
    """
    database = jad.Database()
    for position in range(1000):
        record = pool[position]
        database.add(record)
    del database


# Time the one-by-one appending routines of both libraries.
bench(
    jad_appends,
    k=1000,
    title="appending test with `jad.Database.add` one by one",
)
bench(
    pd_appends,
    k=50,
    title="appending test with `pd.append` one by one",
)

# -------------------
# Searching
# -------------------

# Shared fixtures for the search / get / update benchmarks below:
# the pandas frame is loaded first, then the json_as_db database.
df, db = pd_load_json(), jad_load_json()


def pd_search(i: int):
    """
    Selects the rows of `df` whose 'index' column is greater than `i`.
    The selection is discarded; only the lookup itself matters here.
    """
    above_threshold = df['index'] > i
    matched = df[above_threshold]
    del matched


def jad_search(i: int):
    """
    Runs `db.find` with the condition `jad.Key('index') > i`.
    The result is discarded; only the lookup itself matters here.
    """
    condition = jad.Key('index') > i
    found = db.find(condition)
    del found


# Time the search routines for every threshold i in 0..9999, 10 passes each.
bench(
    jad_search,
    k=10,
    params=[{'i': threshold} for threshold in range(10000)],
    title="Searching items with `db.find(jad.Key('index') > i)` for i: 0 -> 10000",
)
bench(
    pd_search,
    k=10,
    params=[{'i': threshold} for threshold in range(10000)],
    title="Searching items with `df[df['index'] > i]` for i: 0 -> 10000",
)

# -------------------
# Get a single item
# -------------------


def pd_get_single(i: int):
    """
    Fetches the row at position `i` of `df` through `iloc`; the row is
    discarded.
    """
    _ = df.iloc[i]


def jad_get_single(id: str):
    """
    Fetches the item stored under `id` through `db.get`; the item is
    discarded.
    """
    _ = db.get(id)


# Time single-item access: once per database key / per frame row,
# 100 passes each.
bench(
    jad_get_single,
    k=100,
    params=[{'id': key} for key in list(db.keys())],
    title="Get single item by `jad.Database.get(id)`",
)
bench(
    pd_get_single,
    k=100,
    params=[{'i': row} for row in range(len(df))],
    title="Get single item by `pd.DataFrame.iloc[i]`",
)

# -------------------
# Update item
# -------------------

# Value written into the 'about' field by the update benchmarks.
TARGET_STR = 'foobar'


def pd_update(i: int):
    """
    Writes `TARGET_STR` into the 'about' cell of `df` at row label `i`.
    """
    df.at[i, 'about'] = TARGET_STR


def jad_update(id: str):
    """
    Calls `db.modify` for `id` with a mapping that sets 'about' to
    `TARGET_STR`.
    """
    db.modify(id, {'about': TARGET_STR})

# bench(
#     k=100,
#     title=f"Update single item by `db.modify(id, mapping)`",
#     func=jad_update,
#     params=[dict(id=id) for id in list(db.keys())],
# )
# bench(
#     k=100,
#     title=f"Update single item by `df.at[i, col] = value`",
#     func=pd_update,
#     params=[dict(i=i) for i in range(len(df))],
# )

# -------------------
# Update items
# -------------------


# Re-assigned with the same value as in the "Update item" section above.
TARGET_STR = 'foobar'


def pd_updates(i: int):
    """
    Writes `TARGET_STR` into the 'about' column of `df` for the label
    slice `i:i+5`.
    """
    # NOTE(review): `.loc` label slices include the end label, so this
    # presumably touches 6 labels (i..i+5) while `jad_updates` is fed
    # 5 ids (`db_keys[i:i+5]`) -- confirm which count is intended.
    df.loc[i:i+5, ['about']] = TARGET_STR


def jad_updates(id: list):
    """
    Calls `db.modify` once with all given ids, pairing every id with its
    own `{'about': TARGET_STR}` mapping.
    """
    values = []
    for _ in id:
        values.append({'about': TARGET_STR})
    db.modify(id, values)


# Snapshot of the database keys as a list; the (currently disabled)
# multi-item update benchmark below slices it.
db_keys = [*db.keys()]

# bench(
#     k=100,
#     title=f"Update mutiple items by `db.modify(ids, mappings)`",
#     func=jad_updates,
#     params=[dict(id=db_keys[i:i+5]) for i in range(len(db_keys)-5)],
# )
# bench(
#     k=100,
#     title=f"Update mutiple items by `df.loc[i:i+5, [col]] = value`",
#     func=pd_updates,
#     params=[dict(i=i) for i in range(len(df)-5)],
# )

"""
Output
------
[run x 1000] jad.Database().load('generated_10k.json')
>>> (avg.) 149.11810 ms
[run x 1000] pd.read_json('generated_10k.json')
>>> (avg.) 153.71676 ms
[run x 1000] appending test with `jad.Database.add` one by one
>>> (avg.) 8.96103 ms
[run x 500] appending test with `pd.concat` one by one
>>> (avg.) 2760.27654 ms
[run] Searching items with `db.find(jad.Key('index') > i)` for i: 0 -> 10000
>>> (avg.) 9.87914 ms
[run] Searching items with `df[df['index'] > i]` for i: 0 -> 10000
>>> (avg.) 2.59354 ms
[run x 100 x 10000 params] Get single item from `jad.Database.get(id)`
>>> (avg.) 0.0039 ms
[run x 100 x 10000 params] Get single item from `pd.DataFrame.iloc[i]`
>>> (avg.) 0.0689 ms
[run x 100 x 10000 params] Update single item by `db.modify(id, mapping)`: 100%|██████████████████████████| 1000000/1000000 [00:08<00:00, 119453.14it/s]
>>> (avg.) 0.0074 ms
[run x 100 x 10000 params] Update single item by `df.at[i, col] = value`: 100%|████████████████████████████| 1000000/1000000 [00:16<00:00, 61751.81it/s]
>>> (avg.) 0.0148 ms
[run x 100 x 9995 params] Update mutiple items by `db.modify(ids, mappings)`: 100%|██████████████████████████| 999500/999500 [00:14<00:00, 69848.90it/s]
>>> (avg.) 0.0130 ms
[run x 100 x 9995 params] Update mutiple items by `df.loc[i:i+5, [col]] = value`: 100%|███████████████████████| 999500/999500 [15:52<00:00, 1049.22it/s]
>>> (avg.) 0.9432 ms
"""
	import time
	import json_as_db as jad
	import pandas as pd
	from tqdm import tqdm

	import warnings
	warnings.simplefilter(action='ignore', category=FutureWarning)


	def measure(func):
	    """
	    Returns how many milliseconds a single call of `func` takes.

	    `func` is invoked exactly once with no arguments; its return value is
	    discarded.
	    """
	    # perf_counter is monotonic and uses the highest-resolution clock
	    # available, unlike time.time() which follows the (adjustable) wall
	    # clock and can be too coarse for sub-millisecond measurements.
	    start = time.perf_counter()
	    func()
	    end = time.perf_counter()
	    return (end - start) * 1000


	def bench(func, k: int = 1, params: list = [{}], title: str = ''):
	    """
	    Times `func` and prints the average and total elapsed milliseconds.

	    `func` is called once per entry of `params`, and the whole pass over
	    `params` is repeated `k` times, so `k * len(params)` calls are timed.

	    Args:
	        func: callable under test, invoked as `func(**param)`.
	        k: number of passes over `params`.
	        params: list of keyword-argument dicts, one timed call per dict.
	            The shared default `[{}]` is only read here, never mutated.
	        title: label appended to the progress bar description.
	    """
	    pcnt = len(params)
	    pstr = f" x {pcnt} params" if pcnt > 1 else ''
	    desc = f"[run x {k}{pstr}] {title}"
	    t = []
	    pbar = tqdm(total=k * pcnt, desc=desc)
	    try:
	        for _ in range(k):
	            for param in params:
	                t.append(measure(lambda: func(**param)))
	                pbar.update(1)
	    finally:
	        # Close the bar even when `func` raises.
	        pbar.close()
	    # Both aggregates are computed while the samples still exist; nothing
	    # was timed when k == 0 or params is empty, so report zero instead of
	    # dividing by zero.
	    total = sum(t)
	    average = total / len(t) if t else 0.0
	    print(f'>>> (avg.) {average:.4f} ms, (total) {total:.2f} ms')


	# File was downloaded from https://json-generator.com/
	file_name = 'generated_10k.json'

	# -------------------
	# Loads from file
	# -------------------


	def jad_load_json(): return jad.Database().load(file_name)
	def pd_load_json(): return pd.read_json(file_name)


	bench(
	k=1000,
	title=f"jad.Database().load('{file_name}')",
	func=jad_load_json,
	)
	bench(
	k=1000,
	title=f"pd.read_json('{file_name}')",
	func=pd_load_json,
	)

	# -------------------
	# Appending
	# -------------------

	# Only this section needs the raw records, so the import is kept local to it.
	import json

	# Read the same file the other benchmarks use (`file_name`) and decode it
	# explicitly as UTF-8 rather than with the platform's locale default.
	with open(file_name, 'r', encoding='utf-8') as fp:
	    # Raw records that the appending benchmarks index into.
	    pool: list = json.load(fp)


	def pd_appends():
	    """
	    Builds a DataFrame from the first 1000 records of `pool`, adding one
	    row at a time.
	    """
	    df = pd.DataFrame([pool[0]])
	    for i in range(1, 1000):
	        # `DataFrame.append` never modified the frame in place (and was
	        # removed in pandas 2.0), so the returned frame has to be rebound
	        # for the rows to accumulate; `pd.concat` is its replacement.
	        df = pd.concat([df, pd.DataFrame([pool[i]])], ignore_index=True)
	    del df


	def jad_appends():
	db = jad.Database()
	for i in range(0, 1000):
	db.add(pool[i])
	del db


	bench(
	k=1000,
	title=f"appending test with `jad.Database.add` one by one",
	func=jad_appends,
	)
	bench(
	k=50,
	title=f"appending test with `pd.append` one by one",
	func=pd_appends,
	)

	# -------------------
	# Searching
	# -------------------

	df = pd_load_json()
	db = jad_load_json()


	def pd_search(i: int):
	item = df[df['index'] > i]
	del item


	def jad_search(i: int):
	ids = db.find(jad.Key('index') > i)
	del ids


	bench(
	k=10,
	title=f"Searching items with `db.find(jad.Key('index') > i)` for i: 0 -> 10000",
	func=jad_search,
	params=[dict(i=i) for i in range(10000)],
	)
	bench(
	k=10,
	title=f"Searching items with `df[df['index'] > i]` for i: 0 -> 10000",
	func=pd_search,
	params=[dict(i=i) for i in range(10000)],
	)

	# -------------------
	# Get a single item
	# -------------------


	def pd_get_single(i: int):
	row = df.iloc[i]


	def jad_get_single(id: str):
	row = db.get(id)


	bench(
	k=100,
	title=f"Get single item by `jad.Database.get(id)`",
	func=jad_get_single,
	params=[dict(id=id) for id in list(db.keys())],
	)
	bench(
	k=100,
	title=f"Get single item by `pd.DataFrame.iloc[i]`",
	func=pd_get_single,
	params=[dict(i=i) for i in range(len(df))],
	)

	# -------------------
	# Update item
	# -------------------

	TARGET_STR = 'foobar'


	def pd_update(i: int):
	df.at[i, 'about'] = TARGET_STR


	def jad_update(id: str):
	db.modify(id, {'about': TARGET_STR})

	# bench(
	# k=100,
	# title=f"Update single item by `db.modify(id, mapping)`",
	# func=jad_update,
	# params=[dict(id=id) for id in list(db.keys())],
	# )
	# bench(
	# k=100,
	# title=f"Update single item by `df.at[i, col] = value`",
	# func=pd_update,
	# params=[dict(i=i) for i in range(len(df))],
	# )

	# -------------------
	# Update items
	# -------------------


	TARGET_STR = 'foobar'


	def pd_updates(i: int):
	df.loc[i:i+5, ['about']] = TARGET_STR


	def jad_updates(id: list):
	values = [{'about': TARGET_STR} for _ in id]
	db.modify(id, values)


	db_keys = list(db.keys())

	# bench(
	# k=100,
	# title=f"Update mutiple items by `db.modify(ids, mappings)`",
	# func=jad_updates,
	# params=[dict(id=db_keys[i:i+5]) for i in range(len(db_keys)-5)],
	# )
	# bench(
	# k=100,
	# title=f"Update mutiple items by `df.loc[i:i+5, [col]] = value`",
	# func=pd_updates,
	# params=[dict(i=i) for i in range(len(df)-5)],
	# )

	"""
	Output
	------
	[run x 1000] jad.Database().load('generated_10k.json')
	>>> (avg.) 149.11810 ms
	[run x 1000] pd.read_json('generated_10k.json')
	>>> (avg.) 153.71676 ms
	[run x 1000] appending test with `jad.Database.add` one by one
	>>> (avg.) 8.96103 ms
	[run x 500] appending test with `pd.concat` one by one
	>>> (avg.) 2760.27654 ms
	[run] Searching items with `db.find(jad.Key('index') > i)` for i: 0 -> 10000
	>>> (avg.) 9.87914 ms
	[run] Searching items with `df[df['index'] > i]` for i: 0 -> 10000
	>>> (avg.) 2.59354 ms
	[run x 100 x 10000 params] Get single item from `jad.Database.get(id)`
	>>> (avg.) 0.0039 ms
	[run x 100 x 10000 params] Get single item from `pd.DataFrame.iloc[i]`
	>>> (avg.) 0.0689 ms
	[run x 100 x 10000 params] Update single item by `db.modify(id, mapping)`: 100%\|██████████████████████████\| 1000000/1000000 [00:08<00:00, 119453.14it/s]
	>>> (avg.) 0.0074 ms
	[run x 100 x 10000 params] Update single item by `df.at[i, col] = value`: 100%\|████████████████████████████\| 1000000/1000000 [00:16<00:00, 61751.81it/s]
	>>> (avg.) 0.0148 ms
	[run x 100 x 9995 params] Update mutiple items by `db.modify(ids, mappings)`: 100%\|██████████████████████████\| 999500/999500 [00:14<00:00, 69848.90it/s]
	>>> (avg.) 0.0130 ms
	[run x 100 x 9995 params] Update mutiple items by `df.loc[i:i+5, [col]] = value`: 100%\|███████████████████████\| 999500/999500 [15:52<00:00, 1049.22it/s]
	>>> (avg.) 0.9432 ms
	"""
(avg. time per operation with 10K items)	json_as_db	pandas
Loads from file	`149.11810 ms`	`153.71676 ms`
Append items	`8.96103 ms`	`2760.27654 ms`
Search an item	`9.87914 ms`	`2.59354 ms`
Get an item by key	`0.0039 ms`	`0.0689 ms`
Updating an item	`0.0074 ms`	`0.0148 ms`
Updating 5 items in a row	`0.0130 ms`	`0.9432 ms`
Remove an item	`0.0012 ms`	`6.0930 ms`