Skip to content

Instantly share code, notes, and snippets.

@kazimuth
Created May 8, 2019 05:08
Show Gist options
  • Save kazimuth/7bbf5d8593f23aac2979a4d1be5e8d43 to your computer and use it in GitHub Desktop.
Save kazimuth/7bbf5d8593f23aac2979a4d1be5e8d43 to your computer and use it in GitHub Desktop.
filesystem persistence for python science stuff
import functools
import hashlib
import json
import os
from pathlib import Path
from typing import Any, Union

import numpy as np
import pandas
def _load_json(path: Path):
    """Parse the JSON document stored at *path* and return the decoded value.

    The file is opened inside a ``with`` block so the handle is closed as
    soon as parsing finishes (or fails), instead of being left for the
    garbage collector.
    """
    with path.open('r', encoding='utf-8') as handle:
        return json.load(handle)
def _store_json(path: Path, obj: Any):
    """Write *obj* to *path* as JSON if it is a JSON-friendly top-level type.

    Returns ``True`` when the object was written. Returns ``False`` without
    touching the filesystem when *obj* is not a dict, list, str, int or
    float, so the caller can try another serializer.

    Raises whatever ``json.dumps`` raises (e.g. ``TypeError``) when a
    supported container holds a value that JSON cannot encode.
    """
    if not isinstance(obj, (dict, list, str, int, float)):
        return False
    # Serialise before opening the file: an encoding error then leaves no
    # half-written file behind.
    text = json.dumps(obj)
    # 'w' rather than 'a': appending a second document to an existing file
    # would produce invalid JSON.
    with path.open('w', encoding='utf-8') as handle:
        handle.write(text)
    return True
def _load_npy(path: Path):
    """Read the NumPy file at *path* and return what ``np.load`` yields."""
    loaded = np.load(path)
    return loaded
def _store_npy(path: Path, arr: Any):
    """Save *arr* to *path* with ``np.save`` when it is a NumPy array.

    Returns ``True`` after saving; returns ``None`` (and writes nothing)
    for any other kind of object.
    """
    if not isinstance(arr, np.ndarray):
        return None
    np.save(path, arr)
    return True
def _load_parquet(path: Path):
    """Read the parquet file at *path* via ``pandas.read_parquet``."""
    location = str(path)
    return pandas.read_parquet(location)
def _store_parquet(path: Path, df: Any):
    """Write *df* to *path* as parquet when it is a pandas DataFrame.

    Returns ``True`` after writing; returns ``None`` (and writes nothing)
    for any other kind of object.
    """
    if not isinstance(df, pandas.DataFrame):
        return None
    df.to_parquet(path)
    return True
def _load_datastore(path: Path):
    """Wrap *path* in a ``DataStore`` when it is a directory.

    Returns ``None`` for anything that is not a directory.
    """
    if not path.is_dir():
        return None
    return DataStore(path)
# Registry of (file suffix, loader, storer) triples. DataStore walks this
# list in order, so earlier entries take precedence.
SERIALIZERS = [
    # Empty suffix: sub-directories load as nested stores; its storer
    # always declines, so nothing is ever written through this entry.
    ('', _load_datastore, lambda *_ignored: False),
    ('parquet', _load_parquet, _store_parquet),
    ('npy', _load_npy, _store_npy),
    ('json', _load_json, _store_json),
]
class DataStore:
    """Directory-backed key/value store.

    Each key maps to a file ``<root>/<key>.<suffix>``, where the suffix and
    the load/store functions come from the module-level ``SERIALIZERS``
    table. Keys may contain ``/`` to address nested directories. An entry
    with an empty suffix maps to ``<root>/<key>`` itself, which lets a
    sub-directory be read back as a nested ``DataStore``.
    """

    def __init__(self, path: Union[Path, str]):
        """Open the store rooted at *path*, creating the directory if absent."""
        self.path = Path(path)
        if not self.path.exists():
            # exist_ok guards against another process creating the
            # directory between the check above and this call.
            self.path.mkdir(parents=True, exist_ok=True)

    def _validate(self, key: str):
        """Reject keys containing ``..`` or ``/./``.

        Raises:
            ValueError: if *key* contains either sequence.
        """
        if '..' in key or '/./' in key:
            raise ValueError(f'invalid key: "{key}"')

    def _entry_path(self, key: str, suffix: str) -> Path:
        """Return the on-disk location of *key* for a serializer *suffix*."""
        # An empty suffix addresses the bare key; appending '.' would point
        # at a path that never exists.
        name = f'{key}.{suffix}' if suffix else key
        return self.path / name

    def __getitem__(self, key: str):
        """Return the value stored under *key*, or ``None`` if nothing loads.

        Serializers are tried in ``SERIALIZERS`` order; the first loader
        that returns a non-``None`` value wins.
        """
        self._validate(key)
        for suffix, load, _ in SERIALIZERS:
            path = self._entry_path(key, suffix)
            # Suffixed entries must be regular files; the suffix-less entry
            # may be a directory, so only require that it exists.
            present = path.is_file() if suffix else path.exists()
            if not present:
                continue
            result = load(path)
            if result is not None:
                return result
        return None

    def __setitem__(self, key: str, obj: Any):
        """Store *obj* under *key* using the first serializer that accepts it.

        Any existing file for *key* (under every known suffix) is removed
        first, so a value never lingers under a stale format.

        Raises:
            ValueError: if *key* is invalid.
            TypeError: if no serializer accepts *obj*.
        """
        self._validate(key)
        for suffix, _, _ in SERIALIZERS:
            stale = self._entry_path(key, suffix)
            if stale.is_file():
                stale.unlink()
        # note: pathlib splits paths for us
        parent = (self.path / key).parent
        parent.mkdir(parents=True, exist_ok=True)
        for suffix, _, store in SERIALIZERS:
            if store(self._entry_path(key, suffix), obj):
                return
        raise TypeError(f'failed to serialize {obj} to {self.path}/{key}')

    def make(self, key: str):
        """Return a ``DataStore`` rooted at ``<root>/<key>``, creating it if needed."""
        return DataStore(self.path / key)
# Module-level root store, rooted at the relative directory 'data'.
# Constructing it here means the directory is created at import time if
# it does not already exist.
STORE = DataStore('data')
# Sub-store under STORE through which fsmemo reads and writes its entries.
CACHE = STORE.make('fsmemo')
def ident(f):
    """Build an identifier string for the function *f*.

    The result has the form ``<module>.<name>.<hash>``, where ``<hash>`` is
    the first 8 hex digits of the SHA3-224 digest of the function's raw
    bytecode (``f.__code__.co_code``). Only the bytecode is hashed, not the
    constants or names it refers to.
    """
    bytecode = f.__code__.co_code
    short_digest = hashlib.sha3_224(bytecode).hexdigest()[:8]
    module_name = f.__module__
    function_name = f.__name__
    return f'{module_name}.{function_name}.{short_digest}'
def fsmemo(fun):
    """Decorator that memoises *fun* on disk through the module-level CACHE.

    Each call is identified by a JSON encoding of its arguments (so every
    argument must be JSON-serialisable, otherwise ``json.dumps`` raises).
    The entry is stored under ``<ident(fun)>/<hash of encoding>/`` as two
    values: ``memo`` (the full encoding, compared on lookup to rule out
    hash-prefix collisions) and ``result``.
    """
    fid = ident(fun)

    @functools.wraps(fun)
    def wrapper(*args, **kwargs):
        # Keep positional and keyword arguments in separate sub-lists so
        # that f(['a', 1]) and f(a=1) do not encode to the same memo.
        memo = json.dumps([list(args), sorted(kwargs.items())])
        key = hashlib.sha3_224(memo.encode('utf-8')).hexdigest()[:16]
        prefix = f'{fid}/{key}'
        if CACHE[f'{prefix}/memo'] == memo:
            return CACHE[f'{prefix}/result']
        result = fun(*args, **kwargs)
        # Write the result before the memo: the memo marks the entry as
        # valid, so if storing the result fails no later call can see a
        # matching memo with a missing result.
        CACHE[f'{prefix}/result'] = result
        CACHE[f'{prefix}/memo'] = memo
        return result

    return wrapper
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment