Skip to content

Instantly share code, notes, and snippets.

@pitrou
Created October 24, 2023 14:57
Show Gist options
  • Save pitrou/1713b6d32111dac9102e87be1b5ac887 to your computer and use it in GitHub Desktop.
Save pitrou/1713b6d32111dac9102e87be1b5ac887 to your computer and use it in GitHub Desktop.
import pyarrow as pa
import pyarrow.parquet as pq
import numpy as np
import pandas as pd
import time
import io
# Create datasets
def write_datasets():
    """Write one random integer table to three Parquet files.

    The table has 1,000,000 rows and 100 columns of random integers drawn
    from [0, 100000).  The same table is written three times into the
    current working directory:

    * ``foo.parquet`` -- no ``compression`` argument (writer default)
    * ``foo-lz4.parquet`` -- ``compression="lz4"``
    * ``foo-uncompressed.parquet`` -- ``compression="none"``

    All three files share the same encoding options so that only the
    compression setting differs between them.
    """
    values = np.random.randint(0, 100000, size=(1000000, 100))
    table = pa.Table.from_pandas(pd.DataFrame(values))

    # Encoding options applied to every file.  Alternatives that can be
    # swapped in when experimenting:
    #   dict()
    #   dict(dictionary_pagesize_limit=16<<10)
    #   dict(use_dictionary=False, column_encoding="PLAIN")
    encoding_options = {
        "use_dictionary": False,
        "column_encoding": "DELTA_BINARY_PACKED",
    }

    # (output path, compression-specific keyword arguments)
    outputs = (
        ("foo.parquet", {}),
        ("foo-lz4.parquet", {"compression": "lz4"}),
        ("foo-uncompressed.parquet", {"compression": "none"}),
    )
    for path, compression_options in outputs:
        pq.write_table(table, path, **compression_options, **encoding_options)
def _report_bandwidth(label, fname, nbytes, elapsed, niterations):
    """Print the mean per-read bandwidth, in MiB/s, for one measurement."""
    seconds_per_read = elapsed / niterations
    print(f"PyArrow {label} Bandwidth for {fname!r}:", int(nbytes / seconds_per_read / 2**20), "MiB/s")


def run_benchmark(fname, niterations=6):
    """Measure and print PyArrow Parquet read bandwidth for one file.

    Two measurements are taken, each reading the file ``niterations``
    times with ``use_threads=False``:

    1. reading from the path ``fname``;
    2. reading from an in-memory buffer holding the file's raw bytes.

    Bandwidth is reported as the decoded table size (``table.nbytes``)
    divided by the mean time per read, in MiB/s.

    Parameters
    ----------
    fname : str
        Path of the Parquet file to benchmark.
    niterations : int, optional
        Number of reads per measurement (default 6).

    Raises
    ------
    ValueError
        If ``niterations`` is less than 1.
    """
    if niterations < 1:
        raise ValueError("niterations must be at least 1")

    # Load the raw file contents up front for the in-memory measurement.
    # NOTE(review): this presumably also warms the OS page cache before
    # the timed on-disk reads -- confirm if cold-cache numbers are wanted.
    with open(fname, "rb") as f:
        raw = f.read()

    # Time reads that go through the file path.
    start = time.perf_counter()
    for _ in range(niterations):
        table = pq.read_table(fname, use_threads=False)
    stop = time.perf_counter()
    _report_bandwidth("Read", fname, table.nbytes, stop - start, niterations)

    # Time reads from the in-memory copy.  The decoded size is taken from
    # the table read above, since both measurements decode the same bytes.
    start = time.perf_counter()
    for _ in range(niterations):
        pq.read_table(pa.py_buffer(raw), use_threads=False)
    stop = time.perf_counter()
    _report_bandwidth("In-Memory", fname, table.nbytes, stop - start, niterations)
# Generate the three Parquet files, then benchmark each of them in turn.
write_datasets()
for parquet_path in ("foo.parquet", "foo-lz4.parquet", "foo-uncompressed.parquet"):
    run_benchmark(parquet_path)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment