@ogrisel
Last active April 20, 2018 09:06
Memory profiling for Python pickling of large buffers

from pickle import Pickler, _Pickler, Unpickler, _Unpickler, HIGHEST_PROTOCOL
import os
import time
import sys
import gc
from multiprocessing import get_context

PROTOCOL = HIGHEST_PROTOCOL
# Use a 'spawn' context so the monitor process starts from a fresh
# interpreter instead of a fork of the (large) parent heap.
ctx = get_context('spawn')
out_filename = 'output.pkl'

# '--use-pypickle' selects the pure-Python pickler instead of the C one.
if '--use-pypickle' in sys.argv:
    PicklerFactory = _Pickler
    UnpicklerFactory = _Unpickler
else:
    PicklerFactory = Pickler
    UnpicklerFactory = Unpickler


def monitor_worker(pid, queue, stop_event, delay=0.05):
    """Sample the RSS of process `pid` and report the peak on the queue."""
    from psutil import Process
    p = Process(pid)
    peak = 0

    def make_measurement(peak):
        mem = p.memory_info().rss
        if mem > peak:
            peak = mem
        return peak

    # Make measurements every 'delay' seconds until we receive the stop event:
    while not stop_event.wait(timeout=delay):
        peak = make_measurement(peak)

    # Make one last measurement in case memory has increased just before
    # receiving the stop event:
    peak = make_measurement(peak)
    queue.put(peak)


class PeakMemoryMonitor:

    def __enter__(self):
        pid = os.getpid()
        self.queue = q = ctx.Queue()
        self.stop_event = e = ctx.Event()
        self.worker = ctx.Process(target=monitor_worker, args=(pid, q, e))
        self.worker.start()
        return self

    def __exit__(self, exc_type, exc_value, tb):
        self.stop_event.set()
        if exc_type is not None:
            self.worker.terminate()
            return False
        else:
            self.peak = self.queue.get()
            print("=> peak memory usage: {:.3f} GB".format(self.peak / 1e9))
            return True


if __name__ == "__main__":
    size = int(2e9)

    print('Allocating source data...')
    with PeakMemoryMonitor():
        # data = '0' * size
        data = bytearray(size)

    print('Dumping to disk...')
    with PeakMemoryMonitor():
        t0 = time.time()
        with open(out_filename, 'wb') as f:
            p = PicklerFactory(f, protocol=PROTOCOL)
            p.dump(data)
        print('done in {:0.3f}s'.format(time.time() - t0))

    del data, p
    gc.collect()

    print('Loading back from disk...')
    with PeakMemoryMonitor():
        t0 = time.time()
        with open(out_filename, 'rb') as f:
            data = UnpicklerFactory(f).load()
        print('done in {:0.3f}s'.format(time.time() - t0))

    print('Checking data...')
    assert len(data) == size
    print('ok')
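
Note that the monitor has to run in a separate process: the parent is blocked inside dump()/load() while the peak occurs, so it cannot sample its own RSS at that moment. On Unix, the kernel's own high-water mark can serve as a rough whole-run cross-check; a minimal sketch, with the caveat that ru_maxrss units are platform-dependent (the helper name is mine):

import resource  # Unix-only

def peak_rss_gb():
    # Kernel-reported high-water mark of this process's resident set size.
    # ru_maxrss is in KiB on Linux but in bytes on macOS.
    return resource.getrusage(resource.RUSAGE_SELF).ru_maxrss * 1024 / 1e9

Unlike the sampling monitor above, this only reports a single lifetime peak, so it cannot attribute the peak to the allocate, dump, or load phase separately.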
ogrisel commented Nov 9, 2017

Here is the output on CPython master (C pickler and pure-Python pickler):

(py37) ogrisel@ici:~/code/cpython$ git checkout master
Already on 'master'
Your branch is up-to-date with 'origin/master'.
(py37) ogrisel@ici:~/code/cpython$ python ~/tmp/large_pickle_dump.py
Allocating source data...
=> peak memory usage: 2.014 GB
Dumping to disk...
done in 5.141s
=> peak memory usage: 4.014 GB
(py37) ogrisel@ici:~/code/cpython$ python ~/tmp/large_pickle_dump.py --use-pypickle
Allocating source data...
=> peak memory usage: 2.014 GB
Dumping to disk...
done in 5.046s
=> peak memory usage: 5.955 GB
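
The extra gigabytes beyond the 2 GB source correspond to transient in-memory copies of the payload on its way to disk: roughly one extra copy for the C pickler (apparently from its internal output buffering) and two for the pure-Python one. For the pure-Python pickler the dominant cost was the protocol-4 framing layer, which accumulated every opcode, including the multi-gigabyte payload, in an in-memory frame buffer before flushing it. A simplified model of that pre-fix buffering pattern (details differ from the real pickle.py; the frame header is omitted):

import io

class BufferingFramer:
    _FRAME_SIZE_TARGET = 64 * 1024

    def __init__(self, file_write):
        self.file_write = file_write
        self.current_frame = io.BytesIO()

    def write(self, data):
        # First extra copy: `data` is duplicated into the frame buffer
        # while the source object is still alive.
        return self.current_frame.write(data)

    def commit_frame(self, force=False):
        if self.current_frame.tell() >= self._FRAME_SIZE_TARGET or force:
            # Second, transient extra copy when the buffer is materialized
            # to be handed to the file object.
            self.file_write(self.current_frame.getvalue())
            self.current_frame = io.BytesIO()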

Here is the output with the Python pickler fixed in python/cpython#4353:

(py37) ogrisel@ici:~/code/cpython$ git checkout issue-31993-pypickle-dump-mem-optim 
Switched to branch 'issue-31993-pypickle-dump-mem-optim'
(py37) ogrisel@ici:~/code/cpython$ python ~/tmp/large_pickle_dump.py --use-pypickle
Allocating source data...
=> peak memory usage: 2.014 GB
Dumping to disk...
done in 4.238s
=> peak memory usage: 2.014 GB
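
The peak now matches the source allocation: large binary payloads are routed around the frame buffer instead of through it. A sketch of the idea, paraphrasing the framer change in #4353 as a method of the BufferingFramer sketched above:

    def write_large_bytes(self, header, payload):
        # Flush whatever frame is in progress, then stream the opcode
        # header and the payload straight to the file: `payload` never
        # enters the frame buffer and is never concatenated with the
        # header, so no full-size copy is made. With protocol >= 4,
        # large opcodes may legally appear outside a frame.
        self.commit_frame(force=True)
        self.file_write(header)
        self.file_write(payload)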
