Last active
August 29, 2015 14:21
-
-
Save JohannesBuchner/0056d4c2b858470c393c to your computer and use it in GitHub Desktop.
For tests/builds that should only re-run when code or data files have changed (memoized tests)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""
Memoizes a given function, given its code dependencies (loaded modules and
additional data files).

Example::

    import douglasadams

    def costlyfunction():
        # compute answer to the universe and everything
        return douglasadams.compute_answer() == 42

    costlyfunction_mem = codememoize(costlyfunction)
    # run -- first time, it will run the code and store it
    costlyfunction_mem()
    # run -- second time, it will load from storage
    costlyfunction_mem()
    # if the douglasadams lib has been changed, it will run again

Example 2::

    def countwords_in_myfile():
        n = 0
        for line in open('myfile.txt'):
            n += len(line.rstrip().split(' '))
        return n

    # will memoize based on the id, the fingerprint of all loaded code,
    # the hash of myfile, and re-run if any of those has changed
    codememoize(countwords_in_myfile, id='countwords', depfiles=['myfile.txt'])
"""
import sys | |
import os | |
import joblib | |
import hashlib | |
from StringIO import StringIO | |
# Root directory (a relative path) under which memoized_call creates one
# sub-directory per memoized function / dependency fingerprint.
cachedir = 'cache'
def hashfile(afile, hasher, blocksize=65536):
    """Digest the contents of a file-like object.

    Reads ``afile`` in pieces of at most ``blocksize`` and feeds each piece
    to ``hasher`` (any object with ``update`` and ``hexdigest`` methods, such
    as a ``hashlib`` hash). Reading stops at the first empty read.

    Returns the hexadecimal digest string reported by ``hasher``.
    """
    while True:
        chunk = afile.read(blocksize)
        if len(chunk) == 0:
            # an empty read marks the end of the stream
            break
        hasher.update(chunk)
    return hasher.hexdigest()
def memoized_call(func, id='memo', deps=None):
    """Wrap ``func`` in a joblib disk cache keyed on a dependency fingerprint.

    Parameters:
        func: the callable to memoize.
        id: label that becomes part of the cache directory name and of the
            returned wrapper's ``__name__``.
        deps: sequence of strings (e.g. file hashes) describing the state the
            result depends on. ``None`` (the default) means no dependencies.

    The strings in ``deps`` are joined with newlines and hashed; the first six
    hex digits of that hash select a directory
    ``<cachedir>/mem_<func name>_<id>_<hash prefix>``. A different set of
    dependencies therefore lands in a different directory and the function is
    computed afresh. The joined dependency string is stored in a ``deps`` file
    inside that directory the first time it is created.

    Returns the joblib-cached version of ``func``.
    """
    # Avoid a shared mutable default argument.
    deps = [] if deps is None else deps

    depsstr = '\n'.join(deps)
    hashdeps = hashfile(StringIO(depsstr), hashlib.sha256())
    name = 'mem_%s_%s' % (func.__name__, id)

    # One makedirs call creates the cache root and the per-fingerprint
    # sub-directory as needed.
    path = os.path.join(cachedir, '%s_%s' % (name, hashdeps[:6]))
    if not os.path.isdir(path):
        os.makedirs(path)

    # Record which dependencies this directory corresponds to; the context
    # manager ensures the file is flushed and closed.
    deppath = os.path.join(path, 'deps')
    if not os.path.exists(deppath):
        with open(deppath, 'w') as depfile:
            depfile.write(depsstr)

    # Single-argument form prints the same text under Python 2 and 3.
    print('memoization at: %s' % path)

    # NOTE(review): newer joblib releases take the cache directory as
    # ``location`` while older ones only accept ``cachedir`` -- confirm
    # against the installed joblib version. An unknown keyword raises
    # TypeError, in which case the old spelling is used.
    try:
        mem = joblib.Memory(location=path, verbose=True)
    except TypeError:
        mem = joblib.Memory(cachedir=path, verbose=True)

    memfunc = mem.cache(func)
    memfunc.__name__ = name
    return memfunc
def codememoize(func, id='memo', depfiles=None):
    """Memoize ``func`` against the currently loaded code and extra files.

    Parameters:
        func: the callable to memoize.
        id: label forwarded to ``memoized_call`` to name the cache.
        depfiles: optional iterable of additional file paths (e.g. data
            files) whose contents the result depends on. It is not modified.

    The fingerprint is built from the SHA-256 of every file in ``depfiles``
    (in the given order) followed by the files of all modules currently in
    ``sys.modules``, excluding modules without a file, files under
    ``/usr/lib``, files whose path contains ``test``, and ``codememoize.py``
    itself. Files that cannot be opened are skipped.

    Returns the memoized callable produced by ``memoized_call``.
    """
    # Work on a private copy: appending to the argument would mutate the
    # caller's list, and a shared default list would grow on every call.
    files = list(depfiles) if depfiles is not None else []

    codefiles = set()
    # list() takes a snapshot of the currently loaded modules.
    for module in list(sys.modules.values()):
        # Covers None placeholders, modules lacking __file__ and modules
        # whose __file__ is None.
        modfile = getattr(module, '__file__', None)
        if not modfile:
            continue
        # skip modules that are system-installed;
        # these are considered "safe" here.
        if modfile.startswith('/usr/lib'):
            continue
        # skip test modules:
        # if they change, that does not mean the main code changed
        if 'test' in modfile:
            continue
        # Prefer the source file: a module may be loaded from .py in one run
        # and from .pyc in the next although nothing has changed.
        source = modfile.replace('.pyc', '.py')
        # skip this code
        if 'codememoize.py' in source:
            continue
        # Fall back to the loaded file when no source is available.
        codefiles.add(source if os.path.exists(source) else modfile)

    # Sort by path (not by module object) so the dependency order, and hence
    # the cache key, is stable from run to run; the set removes modules that
    # are registered under several names.
    files.extend(sorted(codefiles))

    deps = []
    for path in files:
        try:
            with open(path, 'rb') as fobj:
                fhash = hashfile(fobj, hashlib.sha256())
        except IOError:
            # best effort: unreadable/missing files do not contribute
            continue
        deps.append(fhash)
        # Single-argument form prints the same text under Python 2 and 3.
        print('dependency: %s %s' % (path, fhash))
    return memoized_call(func, id=id, deps=deps)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment