# JohannesBuchner/codememoize.py

## codememoize.py
"""
Memoizes a given function, given its code dependencies (loaded modules and
additional data files)

Example::

  import douglasadams
  def costlyfunction():
  	# compute answer to the universe and everything
  	return douglasadams.compute_answer() == 42

  costlyfunction_mem = codememoize(costlyfunction)

  # run -- first time, it will run the code and store it
  costlyfunction_mem()
  # run -- second time, it will load from storage
  costlyfunction_mem()
  # if the douglasadams lib has been changed, it will run again

Example 2::

  def countwords_in_myfile():
  	n = 0
  	for line in open('myfile.txt'):
  		n += len(line.rstrip().split(' '))
  	return n

  # will memoize based on the id, the fingerprint of all loaded code,
  #      the hash of myfile, and re-run if any of those has changed
  codememoize(countwords_in_myfile, id='countwords', depfiles=['myfile.txt'])


"""

import sys
import os
import joblib
import hashlib
from StringIO import StringIO

# Base directory under which memoized_call() creates one cache
# sub-directory per function name / id / dependency-hash combination.
# It is a relative path, so it resolves against the current working directory.
cachedir = 'cache'

def hashfile(afile, hasher, blocksize=65536):
	"""
	Feed the content of an open file into a hash object, piece by piece.

	afile:     file-like object providing read(size)
	hasher:    hash object providing update() and hexdigest(),
	           e.g. hashlib.sha256()
	blocksize: amount of data requested from afile per read

	Returns the hexadecimal digest once afile has been read to the end.
	"""
	while True:
		chunk = afile.read(blocksize)
		# an empty read marks the end of the file
		if len(chunk) == 0:
			break
		hasher.update(chunk)
	return hasher.hexdigest()

def memoized_call(func, id='memo', deps = []):
	"""
	Wrap func in a joblib disk cache whose location depends on deps.

	The cache lives in ``<cachedir>/mem_<func name>_<id>_<h>``, where ``<h>``
	is the first 6 hex digits of the SHA-256 of the newline-joined deps.
	A different set of dependency fingerprints therefore selects a
	different cache directory. The joined fingerprints are also written to
	a file called ``deps`` inside that directory, unless it already exists.

	func: the function to memoize
	id:   label that becomes part of the cache directory name
	deps: sequence of strings (e.g. file hashes) identifying the
	      dependencies; it is only read, never modified

	Returns the joblib-cached function with its ``__name__`` set to
	``mem_<func name>_<id>``.
	"""
	depsstr = '\n'.join(deps)
	hashdeps = hashfile(StringIO(depsstr), hashlib.sha256())
	name = 'mem_%s_%s' % (func.__name__, id)
	path = os.path.join(cachedir, '%s_%s' % (name, hashdeps[:6]))
	# makedirs creates cachedir and the per-function directory in one go
	# and, unlike mkdir, also works when cachedir is itself a nested path.
	if not os.path.isdir(path):
		os.makedirs(path)

	deppath = os.path.join(path, 'deps')
	if not os.path.exists(deppath):
		# close the file explicitly so the content is flushed right away
		with open(deppath, 'w') as depsfile:
			depsfile.write(depsstr)

	# single-argument form prints the same text on Python 2 and Python 3
	print('memoization at: %s' % path)
	try:
		# newer joblib releases call the cache directory argument 'location'
		mem = joblib.Memory(location=path, verbose=True)
	except TypeError:
		# older joblib releases only know the 'cachedir' keyword
		mem = joblib.Memory(cachedir=path, verbose=True)
	memfunc = mem.cache(func)
	memfunc.__name__ = name
	return memfunc

def codememoize(func, id='memo', depfiles=None):
	"""
	Memoize func, keyed on the content of the loaded code and of depfiles.

	Every file in depfiles and the file of every module currently present in
	sys.modules is hashed with SHA-256; the resulting list of hashes is
	handed to memoized_call(), which selects the cache based on it.

	Modules are left out when
	 * they have no file (built-in modules),
	 * their path starts with '/usr/lib' (treated as system-installed),
	 * their path contains 'test' anywhere (plain substring match),
	 * they are this module itself (path contains 'codememoize.py').

	func:     the function to memoize
	id:       label passed on to memoized_call()
	depfiles: optional list of additional files (e.g. data files) whose
	          content should invalidate the cache; it is not modified

	Files that cannot be opened are skipped.
	Returns the memoized function produced by memoized_call().
	"""
	# Work on a copy: neither the caller's list nor a shared default may be
	# modified, otherwise module files would pile up from call to call.
	allfiles = list(depfiles) if depfiles is not None else []

	modfiles = set()
	# snapshot, because sys.modules can change while we iterate
	for mod in list(sys.modules.values()):
		f = getattr(mod, '__file__', None)
		# entries may be None or lack a file (built-in modules)
		if not f: continue
		# skip modules that are system-installed
		# these are considered "safe" here.
		if f.startswith('/usr/lib'): continue
		# skip test modules
		# if they change, that does not mean the main code changed
		if 'test' in f: continue

		# look for the source code if available
		# this is needed because in the first run, modules are .py,
		# but in the second run, the same modules are loaded from .pyc
		# although they have not been changed they would be run again.
		if f.endswith(('.pyc', '.pyo')):
			fsrc = f[:-1]
		else:
			fsrc = f

		# skip this code
		if 'codememoize.py' in fsrc: continue
		# no source shipped: fall back to the compiled file itself
		if not os.path.exists(fsrc):
			fsrc = f
		modfiles.add(fsrc)

	# Order by path (not by module object) so that the sequence of hashes,
	# and with it the cache location, is the same on every run.
	allfiles.extend(sorted(modfiles))

	deps = []
	for f in allfiles:
		try:
			with open(f, 'rb') as fobj:
				fhash = hashfile(fobj, hashlib.sha256())
		except IOError:
			# unreadable or missing files do not take part in the fingerprint
			continue
		deps.append(fhash)
		# single-argument form prints the same text on Python 2 and Python 3
		print('dependency: %s %s' % (f, fhash))
	return memoized_call(func, id=id, deps=deps)