Create a gist now

Instantly share code, notes, and snippets.

@amyangfei /chksum.py
Last active Aug 29, 2015

What would you like to do?
Memory profile for calculating an MD5 checksum with two methods (chunked reading vs. reading the whole file into memory).
#!/usr/bin/env python
# coding: utf-8
import hashlib
import time
import functools
from memory_profiler import memory_usage
def print_timing(func):
    """Decorator that prints the wall-clock duration of each call in ms.

    If the wrapped function receives a second positional argument it is
    assumed to be a block size in bytes and is echoed as "<x> kb"/"<x> Mb"
    next to the function name.
    """
    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        t1 = time.time()
        res = func(*args, **kwargs)
        t2 = time.time()
        if len(args) < 2:
            spec = ''
        elif args[1] < 1024 * 1024:
            spec = '{} kb'.format(float(args[1]) / 1024)
        else:
            spec = '{} Mb'.format(float(args[1]) / 1024 / 1024)
        # func.__name__ instead of the Python-2-only func.func_name, and a
        # parenthesized single-argument print: both forms behave identically
        # on Python 2 but also work on Python 3.
        print('{}[{}] took {:0.3f} ms'.format(func.__name__, spec, (t2 - t1) * 1000.0))
        return res
    return wrapper
def read_in_chunks(file_object, chunk_size=1024):
    """Lazy function (generator) to read a file piece by piece.
    Default chunk size: 1k."""
    chunk = file_object.read(chunk_size)
    while chunk:
        yield chunk
        chunk = file_object.read(chunk_size)
@print_timing
def md5_separate_read(filename, block_size=2**20):
    """Return the md5 hex digest of *filename*, hashing it chunk by chunk
    (block_size bytes at a time) so memory use stays bounded."""
    checksum = hashlib.md5()
    with open(filename, 'rb') as source:
        for chunk in read_in_chunks(source, block_size):
            checksum.update(chunk)
    return checksum.hexdigest()
@print_timing
def md5_single_read(filename):
    """Return the md5 hex digest of *filename* after slurping the whole
    file into memory in one read."""
    with open(filename, 'rb') as source:
        contents = source.read()
    return hashlib.md5(contents).hexdigest()
if __name__ == '__main__':
    # Create the 1 Gb test file first with:
    #   dd if=/dev/zero of=large.data bs=1M count=1000
    filename = 'large.data'
    dflt_interval = 0.5
    # Profile the chunked reader at chunk sizes 1 kb, 4 kb, 16 kb, 64 kb,
    # 512 kb, 1 Mb, 2 Mb, 4 Mb, 8 Mb and 16 Mb (expressed as bit shifts).
    for shift in (10, 12, 14, 16, 19, 20, 21, 22, 23, 24):
        print(memory_usage((md5_separate_read, (filename, 1 << shift), {}),
                           interval=dflt_interval))
    # Baseline for comparison: hash the whole file read in a single pass.
    print(memory_usage((md5_single_read, (filename, ), {}), interval=dflt_interval))
Results for 1 Gb of data:
md5_separate_read[1.0 kb] took 4994.983 ms
[9.1171875, 9.13671875, 9.13671875, 9.13671875, 9.13671875, 9.13671875, 9.13671875, 9.13671875, 9.13671875, 9.13671875, 9.13671875, 9.140625]
md5_separate_read[4.0 kb] took 3494.183 ms
[9.171875, 9.171875, 9.171875, 9.171875, 9.171875, 9.171875, 9.171875, 9.171875, 9.17578125]
md5_separate_read[16.0 kb] took 2951.646 ms
[9.17578125, 9.17578125, 9.17578125, 9.17578125, 9.17578125, 9.17578125, 9.17578125, 9.19921875]
md5_separate_read[64.0 kb] took 3344.379 ms
[9.19921875, 9.19921875, 9.28125, 9.28125, 9.28125, 9.28125, 9.28125, 9.28125, 9.29296875]
md5_separate_read[512.0 kb] took 3038.037 ms
[9.29296875, 9.6640625, 9.921875, 9.921875, 9.921875, 9.921875, 9.921875, 9.921875, 9.29296875]
md5_separate_read[1.0 Mb] took 2679.496 ms
[9.29296875, 9.29296875, 10.9375, 10.9375, 10.9375, 10.9375, 10.9375, 9.29296875]
md5_separate_read[2.0 Mb] took 2414.020 ms
[9.29296875, 9.9296875, 12.96875, 12.96875, 12.96875, 12.96875, 9.29296875]
md5_separate_read[4.0 Mb] took 2590.301 ms
[9.29296875, 9.6640625, 17.03515625, 17.03515625, 17.03515625, 17.03515625, 17.03515625, 9.29296875]
md5_separate_read[8.0 Mb] took 3658.132 ms
[9.29296875, 10.4375, 25.16015625, 25.16015625, 25.16015625, 25.16015625, 25.16015625, 25.16015625, 25.16015625, 9.29296875]
md5_separate_read[16.0 Mb] took 2469.210 ms
[9.29296875, 9.66015625, 41.15234375, 41.15234375, 41.15234375, 41.15234375, 9.296875]
md5_single_read[] took 3316.208 ms
[9.296875, 9.421875, 389.1796875, 889.3359375, 1009.21875, 1009.21875, 1009.21875, 1009.21875, 9.296875]
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment