-
-
Save wesm/c75ad3b6dcd37231aaacf56a80a5e401 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import numpy as np | |
import pandas as pd | |
import pyarrow as pa | |
import pyarrow.parquet as pq | |
import time | |
import gc | |
import psutil | |
# Process handle created once at import time and reused by every RSS query.
PROC = psutil.Process()


def get_rss():
    """Return the resident set size psutil reports for PROC."""
    mem = PROC.memory_info()
    return mem.rss


# One RSS reading is appended here each time a memory_use block exits.
RSS_TELEMETRY = []
class memory_use:
    """Context manager that reports the process RSS when its block exits.

    Baselines (process RSS plus the Arrow allocation counters) are captured
    when the instance is constructed, not when the ``with`` block is entered.
    On exit a garbage collection is forced, the current RSS and its change
    from the baseline are printed, and the RSS is appended to
    ``RSS_TELEMETRY``.
    """

    def __init__(self):
        # The Arrow-side baselines are only consumed by the optional
        # diagnostics that are commented out at the end of __exit__.
        self.start_use = pa.total_allocated_bytes()
        self.start_rss = get_rss()
        self.pool = pa.default_memory_pool()
        self.start_peak_use = self.pool.max_memory()

    def __enter__(self):
        # Return the instance so ``with memory_use() as m`` exposes the
        # recorded baselines instead of binding ``m`` to None.
        return self

    def __exit__(self, type, value, traceback):
        # Parameter names are kept as-is for interface stability even though
        # ``type`` shadows the builtin inside this method.
        # Run a collection before sampling so garbage that is merely awaiting
        # collection is not counted in the reading.
        gc.collect()
        rss = get_rss()
        print("RSS: {}, change: {}"
              .format(rss, rss - self.start_rss))
        RSS_TELEMETRY.append(rss)
        # print("Change in Arrow allocations: {}"
        #       .format(pa.total_allocated_bytes() - self.start_use))
        # print("Change in peak use: {}"
        #       .format(self.pool.max_memory() - self.start_peak_use))
        # Never suppress an exception raised inside the block.
        return False
def write_fec(csv_path='/home/wesm/code/pydata-book/datasets/fec/P00000001-ALL.csv',
              out_path=None, copies=10):
    """Write a Parquet file built from repeated copies of the FEC CSV.

    The CSV is loaded with pandas, its ``contbr_zip`` column is coerced to
    int64 (values that cannot be converted become -1), the frame is converted
    to an Arrow table, and ``copies`` references to that table are
    concatenated and written with ``pq.write_table``.

    Parameters
    ----------
    csv_path : str
        Location of the source CSV.  Defaults to the original hard-coded path.
    out_path : str, optional
        Destination Parquet file.  When omitted, the module-level ``path``
        is used, which matches the original behaviour.
    copies : int
        How many times the table is repeated in the output (default 10).

    Raises
    ------
    ValueError
        If ``copies`` is less than 1.
    """
    if copies < 1:
        raise ValueError("copies must be at least 1, got {}".format(copies))
    if out_path is None:
        # Resolved at call time so the global only has to exist by then.
        out_path = path

    def coerce_int(x):
        # int() raises ValueError for unparsable strings and NaN, TypeError
        # for unsupported objects, and OverflowError for infinities.
        try:
            return int(x)
        except (TypeError, ValueError, OverflowError):
            return -1

    fec = pd.read_csv(csv_path)
    fec['contbr_zip'] = fec['contbr_zip'].map(coerce_int).astype(np.int64)
    table = pa.concat_tables([pa.table(fec)] * copies)
    pq.write_table(table, out_path)
# Parquet file read by the benchmark loop below (and the default output
# location for write_fec).
# path = '/home/wesm/tmp/fec.parquet'
path = '/home/wesm/Downloads/big.snappy.parquet'

# Configure jemalloc's page-decay time to 10000 ms before any reads happen.
# NOTE(review): per the pyarrow docs this controls how quickly freed pages
# are returned to the OS - confirm against the installed pyarrow version.
pa.jemalloc_set_decay_ms(10000)

# perf_counter is monotonic, so the elapsed figure cannot be skewed by
# wall-clock adjustments while the benchmark runs.
start = time.perf_counter()
for _ in range(10):
    with memory_use():
        # The table is dropped immediately; only the RSS reading taken when
        # the memory_use block exits is of interest.
        pq.read_table(path, memory_map=False)
elapsed = time.perf_counter() - start
print(f"Took {elapsed} seconds")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment