2D-NumPy Matrix Benchmark: Measuring Read/Write Times and Memory Usage

import json
import multiprocessing
import sys
import time
import typing as t
from pathlib import Path

import numpy as np
import pandas as pd
import typing_extensions as te

HERE = Path(__file__).parent.resolve()

Shape = t.Tuple[int, int]
FileFormat = te.Literal["csv", "xlsx", "json", "npz"]
StatTable = t.Dict[Shape, t.Dict[FileFormat, float]]


class Stat(t.NamedTuple):
    """Result of a single save/load benchmark for one format and shape."""

    file_format: FileFormat
    shape: Shape
    save_duration: float
    load_duration: float
    size: int
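

# Aside (not in the original gist): time.time() is a wall clock and can jump
# if the system clock is adjusted; time.perf_counter() is the monotonic clock
# intended for interval measurement. A minimal helper sketch, shown for
# illustration only -- the benchmarks below keep the original time.time() calls:
def timed(func: t.Callable, *args: t.Any, **kwargs: t.Any) -> t.Tuple[t.Any, float]:
    """Run ``func(*args, **kwargs)`` and return ``(result, elapsed_seconds)``."""
    start = time.perf_counter()
    result = func(*args, **kwargs)
    return result, time.perf_counter() - start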


def benchmark_excel(array: np.ndarray) -> Stat:
    xlsx_path = HERE.joinpath(
        f"big_excel_matrix_{array.shape[0]:04d}-{array.shape[1]:04d}.xlsx"
    )
    df = pd.DataFrame(array)
    timestamp = time.time()
    df.to_excel(xlsx_path, index=False, header=False)
    save_duration = time.time() - timestamp
    timestamp = time.time()
    # Read back with header=None so the first data row is not consumed as a
    # header, matching how the file was written.
    pd.read_excel(xlsx_path, header=None)
    load_duration = time.time() - timestamp
    return Stat(
        file_format="xlsx",
        shape=array.shape,
        save_duration=save_duration,
        load_duration=load_duration,
        size=xlsx_path.stat().st_size,
    )


def benchmark_json(array: np.ndarray) -> Stat:
    json_path = HERE.joinpath(
        f"big_json_matrix_{array.shape[0]:04d}-{array.shape[1]:04d}.json"
    )
    # Use the actual number of rows rather than a hard-coded 8760, so the
    # index stays consistent for the 365-row shapes as well.
    obj = {
        "index": list(range(array.shape[0])),
        "columns": list(range(array.shape[1])),
        "data": array.tolist(),
    }
    timestamp = time.time()
    with json_path.open(mode="w", encoding="utf-8") as fd:
        fd.write(json.dumps(obj, indent=True))
    save_duration = time.time() - timestamp
    timestamp = time.time()
    with json_path.open(mode="r", encoding="utf-8") as fd:
        json.loads(fd.read())
    load_duration = time.time() - timestamp
    return Stat(
        file_format="json",
        shape=array.shape,
        save_duration=save_duration,
        load_duration=load_duration,
        size=json_path.stat().st_size,
    )
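
# Aside (not in the original gist): the dict built above matches pandas'
# "split" JSON orientation, so an equivalent (assumed, untimed here) one-liner
# would be:
#     pd.DataFrame(array).to_json(json_path, orient="split", indent=1)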


def benchmark_csv(array: np.ndarray) -> Stat:
    csv_path = HERE.joinpath(
        f"big_csv_matrix_{array.shape[0]:04d}-{array.shape[1]:04d}.csv"
    )
    timestamp = time.time()
    # Note: despite the .csv extension, the file is written tab-separated.
    np.savetxt(csv_path, array, delimiter="\t", fmt="%.6f")
    save_duration = time.time() - timestamp
    timestamp = time.time()
    np.loadtxt(csv_path, delimiter="\t", dtype=np.float64, ndmin=2)
    load_duration = time.time() - timestamp
    return Stat(
        file_format="csv",
        shape=array.shape,
        save_duration=save_duration,
        load_duration=load_duration,
        size=csv_path.stat().st_size,
    )


def benchmark_npz(array: np.ndarray) -> Stat:
    npz_path = HERE.joinpath(
        f"big_npz_matrix_{array.shape[0]:04d}-{array.shape[1]:04d}.npz"
    )
    timestamp = time.time()
    np.savez(npz_path, array1=array)
    save_duration = time.time() - timestamp
    timestamp = time.time()
    # np.load is lazy for .npz archives; indexing "array1" forces the actual
    # read, which is what we want to time.
    # noinspection PyStatementEffect
    np.load(npz_path)["array1"]
    load_duration = time.time() - timestamp
    return Stat(
        file_format="npz",
        shape=array.shape,
        save_duration=save_duration,
        load_duration=load_duration,
        size=npz_path.stat().st_size,
    )
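

# Aside (not part of the original comparison): np.savez writes an uncompressed
# zip archive, while np.savez_compressed trades slower saves for smaller
# files. A minimal sketch of the same measurement for the compressed variant:
def benchmark_npz_compressed(array: np.ndarray) -> t.Tuple[float, float, int]:
    path = HERE.joinpath(
        f"big_npz_compressed_matrix_{array.shape[0]:04d}-{array.shape[1]:04d}.npz"
    )
    timestamp = time.time()
    np.savez_compressed(path, array1=array)
    save_duration = time.time() - timestamp
    timestamp = time.time()
    np.load(path)["array1"]  # indexing forces the actual read
    load_duration = time.time() - timestamp
    return save_duration, load_duration, path.stat().st_size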


def _print_table(stat_table: StatTable, *, file: t.TextIO = sys.stdout, unit: int = 1) -> None:
    # Collect the file formats used as column headers.
    headers = sorted(
        {c for shape, values in stat_table.items() for c in values}
    )
    print("| shape | " + " | ".join(headers) + " |", file=file)
    print(
        "|-------|-" + ":|-".join("-" * len(h) for h in headers) + ":|",
        file=file,
    )
    for shape, values in sorted(stat_table.items()):
        line = [shape] + [
            f"{values[header] / unit:0.3f}" for header in headers
        ]
        print("| " + " | ".join(map(str, line)) + " |", file=file)


def create_big_matrix() -> None:
    shapes = [
        (365, 1),
        (365, 100),
        (365, 1000),
        (8760, 1),
        (8760, 10),
        (8760, 100),
        (8760, 200),
        (8760, 1000),
        (8760, 2000),
    ]
    save_durations: StatTable = {
        s: {"xlsx": 0, "json": 0, "csv": 0, "npz": 0} for s in shapes
    }
    load_durations: StatTable = {
        s: {"xlsx": 0, "json": 0, "csv": 0, "npz": 0} for s in shapes
    }
    sizes: StatTable = {
        s: {"xlsx": 0, "json": 0, "csv": 0, "npz": 0} for s in shapes
    }
    # Run the four format benchmarks for every shape in parallel.
    with multiprocessing.Pool(processes=20) as pool:
        results = []
        for shape in shapes:
            array = np.random.rand(*shape) * 1000
            results.append(pool.apply_async(benchmark_csv, (array,)))
            results.append(pool.apply_async(benchmark_json, (array,)))
            results.append(pool.apply_async(benchmark_excel, (array,)))
            results.append(pool.apply_async(benchmark_npz, (array,)))
        for res in results:
            stat = res.get()
            save_durations[stat.shape][stat.file_format] = stat.save_duration
            load_durations[stat.shape][stat.file_format] = stat.load_duration
            sizes[stat.shape][stat.file_format] = stat.size
    report_path = HERE.joinpath("report.md")
    with report_path.open(mode="w", encoding="utf-8") as report:
        print("# Big Matrices Serialization Benchmark", file=report)
        print(file=report)
        print("## Saving duration (s)", file=report)
        print(file=report)
        _print_table(save_durations, file=report)
        print(file=report)
        print("## Loading duration (s)", file=report)
        print(file=report)
        _print_table(load_durations, file=report)
        print(file=report)
        print("## File size (MiB)", file=report)
        print(file=report)
        _print_table(sizes, file=report, unit=1024 * 1024)


if __name__ == "__main__":
    create_big_matrix()
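
# Usage (filename assumed; save this gist as e.g. benchmark_matrix.py):
#     python benchmark_matrix.py
# The script writes one data file per (shape, format) pair next to itself and
# a markdown summary of the three tables in report.md.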