Skip to content

Instantly share code, notes, and snippets.

@JohannesBuchner
Last active August 29, 2015 14:21
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save JohannesBuchner/0056d4c2b858470c393c to your computer and use it in GitHub Desktop.
Save JohannesBuchner/0056d4c2b858470c393c to your computer and use it in GitHub Desktop.
For tests/builds that should only re-run when code or data files have changed (memoized tests)
"""
Memoizes a given function, based on its code dependencies (loaded modules and
additional data files).

Example::

    import douglasadams

    def costlyfunction():
        # compute answer to the universe and everything
        return douglasadams.compute_answer() == 42

    costlyfunction_mem = codememoize(costlyfunction)
    # run -- first time, it will run the code and store the result
    costlyfunction_mem()
    # run -- second time, it will load the result from storage
    costlyfunction_mem()
    # if the douglasadams lib has been changed, it will run again

Example 2::

    def countwords_in_myfile():
        n = 0
        for line in open('myfile.txt'):
            n += len(line.rstrip().split(' '))
        return n

    # will memoize based on the id, the fingerprint of all loaded code,
    # the hash of myfile, and re-run if any of those has changed
    codememoize(countwords_in_myfile, id='countwords', depfiles=['myfile.txt'])
"""
import sys
import os
import joblib
import hashlib
from StringIO import StringIO
# Root directory for all memoization caches: memoized_call creates one
# sub-directory per function name / id / dependency-hash prefix beneath it.
# The path is relative, so it resolves against the current working directory.
cachedir = 'cache'
def hashfile(afile, hasher, blocksize=65536):
    """Feed an open file-like object through *hasher* and return the hex digest.

    The stream is consumed from its current position in chunks of at most
    *blocksize*, so arbitrarily large files never have to fit in memory.
    The stream is neither rewound nor closed; that is left to the caller.

    :param afile: object exposing ``read(size)``; reading stops at the first
        empty chunk.
    :param hasher: object exposing ``update(data)`` and ``hexdigest()``,
        e.g. ``hashlib.sha256()``. It is updated in place.
    :param blocksize: maximum number of bytes requested per ``read`` call.
    :returns: ``hasher.hexdigest()`` after all chunks have been fed in.
    """
    while True:
        chunk = afile.read(blocksize)
        if len(chunk) == 0:
            # an empty read marks end-of-stream
            break
        hasher.update(chunk)
    return hasher.hexdigest()
def memoized_call(func, id='memo', deps=None):
    """Wrap *func* in a joblib disk cache whose location depends on *deps*.

    The cache directory is
    ``<cachedir>/mem_<func.__name__>_<id>_<first 6 hex digits of the hash>``,
    where the hash is the SHA-256 of the newline-joined *deps*. A different
    list of dependency strings therefore selects a different directory, and
    results stored for another dependency set are not reused.

    The joined dependency string is also written to a ``deps`` file inside
    the cache directory the first time that directory is used.

    :param func: callable to memoize; its ``__name__`` becomes part of the
        directory name.
    :param id: label distinguishing several memoizations of the same
        function (the name shadows the builtin but is part of the public
        signature, so it is kept).
    :param deps: iterable of strings identifying the dependencies
        (``codememoize`` passes file hashes). ``None`` means no dependencies.
    :returns: the object returned by ``joblib.Memory.cache(func)``, with its
        ``__name__`` set to ``mem_<func.__name__>_<id>``.
    """
    # None instead of a mutable [] default; behaves the same for callers.
    depsstr = '\n'.join(deps or [])
    hashdeps = hashfile(StringIO(depsstr), hashlib.sha256())
    name = 'mem_%s_%s' % (func.__name__, id)
    path = os.path.join(cachedir, '%s_%s' % (name, hashdeps[:6]))
    # makedirs creates cachedir (even a nested one) and the per-function
    # directory in a single step.
    if not os.path.isdir(path):
        os.makedirs(path)
    deppath = os.path.join(path, 'deps')
    if not os.path.exists(deppath):
        # Keep a readable record of what this cache directory was keyed on;
        # the with-block guarantees the handle is flushed and closed.
        with open(deppath, 'w') as depfile:
            depfile.write(depsstr)
    # Single-argument form: prints the same text whether print is the
    # Python 2 statement or the Python 3 function.
    print('memoization at: %s' % path)
    # NOTE(review): newer joblib releases name this parameter `location`
    # instead of `cachedir` -- confirm against the installed version.
    mem = joblib.Memory(cachedir=path, verbose=True)
    memfunc = mem.cache(func)
    memfunc.__name__ = name
    return memfunc
def _loaded_module_sources():
    """Return the sorted file paths of the loaded modules worth fingerprinting."""
    sources = set()
    # list(): take a snapshot, sys.modules may change while we look at it
    for mod in list(sys.modules.values()):
        # sys.modules may hold None placeholders, and built-in or namespace
        # modules have no (or a None) __file__
        modfile = getattr(mod, '__file__', None)
        if not modfile:
            continue
        # skip modules that are system-installed
        # these are considered "safe" here.
        if modfile.startswith('/usr/lib'):
            continue
        # skip test modules
        # if they change, that does not mean the main code changed
        if 'test' in modfile:
            continue
        # Prefer the source file: on the first run modules are loaded from
        # .py, on later runs from compiled files, and hashing whichever was
        # loaded would trigger a re-run although nothing changed.
        if modfile.endswith(('.pyc', '.pyo')):
            srcfile = modfile[:-1]
        else:
            srcfile = modfile
        # skip this code
        if 'codememoize.py' in srcfile:
            continue
        # fall back to the compiled file when no source is shipped
        sources.add(srcfile if os.path.exists(srcfile) else modfile)
    # Sort by path: ordering the module objects themselves is address-based
    # on Python 2 and a TypeError on Python 3, so it cannot give a
    # reproducible dependency order.
    return sorted(sources)


def codememoize(func, id='memo', depfiles=None):
    """Memoize *func* on disk, keyed on the loaded code and extra data files.

    Every file in *depfiles*, followed by the source file of every module
    currently in ``sys.modules`` (except those under ``/usr/lib``, those with
    ``test`` in their path and this module itself), is hashed with SHA-256.
    The list of hashes is handed to ``memoized_call``, which derives the
    cache directory from it.

    :param func: callable to memoize.
    :param id: label distinguishing several memoizations of the same
        function; passed through to ``memoized_call``.
    :param depfiles: paths of additional files whose content should
        invalidate the cache when it changes. The argument is not modified.
    :returns: whatever ``memoized_call`` returns for *func*.

    Files that cannot be opened or read (``IOError``) are skipped silently.
    """
    # Work on a copy: appending to the argument would mutate the caller's
    # list, and with a shared default list module paths would pile up
    # across calls.
    depfiles = list(depfiles) if depfiles is not None else []
    depfiles.extend(_loaded_module_sources())

    deps = []
    for f in depfiles:
        try:
            # the with-block closes the handle once the file is hashed
            with open(f, 'rb') as fh:
                fhash = hashfile(fh, hashlib.sha256())
        except IOError:
            continue
        deps.append(fhash)
        # Single-argument form: prints the same text whether print is the
        # Python 2 statement or the Python 3 function.
        print('dependency: %s %s' % (f, fhash))
    return memoized_call(func, id=id, deps=deps)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment