Skip to content

Instantly share code, notes, and snippets.

@wesm
Created October 19, 2019 17:02
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save wesm/c75ad3b6dcd37231aaacf56a80a5e401 to your computer and use it in GitHub Desktop.
Save wesm/c75ad3b6dcd37231aaacf56a80a5e401 to your computer and use it in GitHub Desktop.
import numpy as np
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
import time
import gc
import psutil
# Process handle created once at import time and reused for every RSS query.
PROC = psutil.Process()


def get_rss():
    """Return the resident set size (RSS) that psutil reports for ``PROC``."""
    mem_info = PROC.memory_info()
    return mem_info.rss


# One RSS reading is appended here each time a ``memory_use`` block exits.
RSS_TELEMETRY = []
class memory_use:
    """Context manager that reports process RSS when its ``with`` block ends.

    Baseline readings (process RSS plus Arrow's allocated-bytes and
    peak-memory counters) are captured when the instance is constructed.
    On exit a garbage collection is forced, the current RSS and its change
    from the baseline are printed, and the RSS is appended to the
    module-level ``RSS_TELEMETRY`` list. Exceptions raised inside the
    ``with`` block are not suppressed.
    """

    def __init__(self):
        # Arrow-side baselines; currently only consumed by the optional
        # diagnostics left commented out at the end of ``__exit__``.
        self.start_use = pa.total_allocated_bytes()
        self.start_rss = get_rss()
        self.pool = pa.default_memory_pool()
        self.start_peak_use = self.pool.max_memory()

    def __enter__(self):
        # Hand back the instance so ``with memory_use() as m:`` binds the
        # tracker (and its baselines) instead of ``None``.
        return self

    def __exit__(self, type, value, traceback):
        # Collect first so objects that are merely awaiting collection do
        # not inflate the reading.
        gc.collect()
        rss = get_rss()
        print("RSS: {}, change: {}".format(rss, rss - self.start_rss))
        RSS_TELEMETRY.append(rss)
        # Optional Arrow allocator diagnostics (disabled):
        # print("Change in Arrow allocations: {}"
        #       .format(pa.total_allocated_bytes() - self.start_use))
        # print("Change in peak use: {}"
        #       .format(self.pool.max_memory() - self.start_peak_use))
def write_fec(
    csv_path='/home/wesm/code/pydata-book/datasets/fec/P00000001-ALL.csv',
    out_path=None,
    copies=10,
):
    """Convert the FEC CSV dataset into a (replicated) Parquet file.

    The CSV is loaded with pandas, its ``contbr_zip`` column is coerced to
    ``int64`` (values that cannot be parsed as integers become ``-1``), the
    resulting Arrow table is concatenated with itself ``copies`` times, and
    the result is written with ``pq.write_table``.

    Parameters
    ----------
    csv_path : str
        Location of the source CSV file.
    out_path : str, optional
        Destination Parquet file. When omitted, the module-level ``path``
        is used, looked up at call time.
    copies : int
        Number of times the table is repeated in the output.
    """
    def coerce_int(x):
        # Best-effort conversion: anything ``int()`` rejects maps to -1 so
        # the column can be stored with a single integer dtype.
        try:
            return int(x)
        except (TypeError, ValueError, OverflowError):
            return -1

    if out_path is None:
        # Resolved here rather than in the signature because ``path`` is
        # assigned after this function is defined.
        out_path = path

    fec = pd.read_csv(csv_path)
    fec['contbr_zip'] = fec['contbr_zip'].map(coerce_int).astype(np.int64)
    fec = pa.concat_tables([pa.table(fec)] * copies)
    pq.write_table(fec, out_path)
# Parquet file read by the benchmark loop below; ``write_fec`` also resolves
# this name when writing its output.
# path = '/home/wesm/tmp/fec.parquet'
path = '/home/wesm/Downloads/big.snappy.parquet'

# Configure jemalloc's decay time (in milliseconds) before any reads happen.
pa.jemalloc_set_decay_ms(10000)

# perf_counter() is monotonic, so the elapsed time cannot be skewed by
# system clock adjustments the way time.time() can.
start = time.perf_counter()
for _ in range(10):
    with memory_use():
        # The table is intentionally discarded right away; the RSS printed
        # on exit therefore shows what the process retains after the read.
        pq.read_table(path, memory_map=False)
elapsed = time.perf_counter() - start
print(f"Took {elapsed} seconds")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment