Skip to content

Instantly share code, notes, and snippets.

@danielpodrazka
Last active December 22, 2022 16:15
Show Gist options
  • Save danielpodrazka/d65e77c07963a26f5672630b36e36caf to your computer and use it in GitHub Desktop.
Save danielpodrazka/d65e77c07963a26f5672630b36e36caf to your computer and use it in GitHub Desktop.
Store the file below in your project and import it into the script you are currently working on. I wrote an article about this gist: https://medium.com/@daniep/speed-up-developing-python-etls-with-invisible-cache-eb2eaadf6918
import os
import pandas as pd
from diskcache import FanoutCache
from diskcache.core import ENOVAL, args_to_key, ft, full_name
class cconnect:
    """Do-nothing context manager whose ``begin()`` hands back the instance.

    Entering the object (via ``with`` or by calling ``begin()``) returns the
    object itself, and leaving it performs no work.

    NOTE(review): presumably a stand-in for a database connection/engine
    object when the real one is not needed -- confirm against callers.
    """

    def __init__(self):
        # Expose ``begin`` as an alias of the bound ``__enter__`` method, so
        # ``obj.begin()`` and ``with obj:`` both yield ``obj``.
        self.begin = self.__enter__

    def __enter__(self):
        """Enter the ``with`` block and return this same object."""
        return self

    def __exit__(self, type, value, traceback):
        """Leave the ``with`` block; nothing is cleaned up or suppressed."""
        return None
def memoize(
    self, name=None, typed=False, expire=None, tag=None, ignore=(), refresh=False
):
    """Return a memoizing decorator backed by this cache, with forced refresh.

    The wrapped callable's result is looked up under a key built from its
    arguments; on a miss the callable runs and the result is stored. Results
    that cannot be stored (``TypeError`` from ``self.set``) are still
    returned, and a message is printed instead of raising.

    :param self: cache object providing ``get``, ``set`` and ``delete``
    :param name: key prefix; when ``None`` the function's full name is used
    :param typed: forwarded to ``args_to_key``
    :param expire: forwarded to ``self.set``; results are only stored when
        this is ``None`` or greater than zero
    :param tag: forwarded to ``self.set``
    :param ignore: forwarded to ``args_to_key`` (arguments left out of the key)
    :param refresh: when true, every call discards the stored entry, runs the
        callable again and stores the fresh result
    :return: decorator to apply to the callable
    :raises TypeError: if ``name`` is callable
    """
    if callable(name):
        # Typically the result of writing ``@cache.memoize`` without
        # parentheses, which would pass the function itself as ``name``.
        raise TypeError("name cannot be callable")

    def decorator(func):
        """Decorator created by memoize() for callable `func`."""
        base = (full_name(func),) if name is None else (name,)

        @ft.wraps(func)
        def wrapper(*args, **kwargs):
            """Wrapper for callable to cache arguments and return values."""
            key = wrapper.__cache_key__(*args, **kwargs)
            result = ENOVAL
            if refresh:
                # Delete by key instead of reading the value back first:
                # loading a stored result only to discard it is wasted work,
                # and an ``is not None`` existence test would never refresh a
                # cached ``None``. Missing keys are ignored by ``delete``.
                self.delete(key, retry=True)
            else:
                result = self.get(key, default=ENOVAL, retry=True)
            if result is ENOVAL:
                result = func(*args, **kwargs)
                if expire is None or expire > 0:
                    try:
                        self.set(key, result, expire, tag=tag, retry=True)
                    except TypeError:
                        # Best effort: the caller still gets the result even
                        # though it could not be stored.
                        print(f"diskcache: Couldn't cache {key}")
            return result

        def __cache_key__(*args, **kwargs):
            """Make key for cache given function arguments."""
            return args_to_key(base, args, kwargs, typed, ignore)

        wrapper.__cache_key__ = __cache_key__
        return wrapper

    return decorator
# Swap in the memoize defined in this file so FanoutCache instances accept
# the extra ``refresh`` argument.
FanoutCache.memoize = memoize

DEFAULT_CACHE_SIZE = (2 ** 30) * 50  # 50 GiB

# The cache directory (LOCAL_CACHE_PATH) and size limit (LOCAL_CACHE_SIZE, in
# bytes) can be overridden through the environment. Environment values are
# always strings, so the size is converted to int before it reaches the cache;
# a non-numeric value raises ValueError here.
cache = FanoutCache(
    os.environ.get(
        "LOCAL_CACHE_PATH", os.path.join(os.path.expanduser("~"), "localcache")
    ),
    size_limit=int(os.environ.get("LOCAL_CACHE_SIZE", DEFAULT_CACHE_SIZE)),
)
# Names of the pandas reader functions that are replaced by cached versions.
cache_functions = [
    "read_feather",
    "read_excel",
    "read_csv",
    "read_table",
    "read_sql",
    "read_sql_query",
    "read_sql_table",
    "read_parquet",
    "read_json",
]

# Readers that take a database connection. The connection must stay out of
# the cache key, whether it is passed positionally (index 1) or as the
# ``con`` keyword.
_SQL_READERS = ("read_sql", "read_sql_query", "read_sql_table")

# For every reader ``pd.<name>`` install two wrappers around the original:
#   pd.<name>    -> returns the cached result when one exists
#   pd.re<name>  -> always re-runs the reader and stores the fresh result
# Both wrap the same original function, so they share cache entries.
for func in cache_functions:
    org_func = getattr(pd, func)
    _ignored = (1, "con") if func in _SQL_READERS else ()
    setattr(pd, func, cache.memoize(ignore=_ignored)(org_func))
    setattr(
        pd, f"re{func}", cache.memoize(refresh=True, ignore=_ignored)(org_func)
    )
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment