Skip to content

Instantly share code, notes, and snippets.

@wassname
Last active March 20, 2020 00:11
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save wassname/ca01b4e1f7ea403a3fc2c12fba7d4b0d to your computer and use it in GitHub Desktop.
Save wassname/ca01b4e1f7ea403a3fc2c12fba7d4b0d to your computer and use it in GitHub Desktop.
Hash pandas and numpy objects in a way that persists between interpreters (for caching)
import pandas as pd
import numpy as np
import hashlib
import json
def _pandas_json_dumps(o):
    """Serialize *o* to a JSON string with the encoder bundled in pandas.

    The encoder has been exposed as ``pandas.io.json.dumps``; newer pandas
    releases appear to expose it as ``ujson_dumps`` instead (NOTE(review):
    confirm against the pandas version you target). Both names are tried so
    the module keeps working across that rename.

    Raises:
        AttributeError: if neither encoder name is available.
    """
    json_mod = pd.io.json
    encode = getattr(json_mod, "dumps", None) or getattr(json_mod, "ujson_dumps", None)
    if encode is None:
        raise AttributeError(
            "pandas.io.json provides neither 'dumps' nor 'ujson_dumps'"
        )
    return encode(o)


def default(o):
    """Fallback serializer for ``json.dumps(..., default=default)``.

    Sets are unordered, so they are no good for hashing as-is: a set is
    sorted into a list first so that equal sets always serialize the same
    way. Every object (sorted set or otherwise) is then handed to the pandas
    JSON encoder and the resulting JSON *string* is returned.

    Args:
        o: the object the standard ``json`` encoder could not serialize.

    Returns:
        A JSON-formatted string describing ``o``.

    Raises:
        TypeError: with message ``'set is not orderable'`` when ``o`` is a
            set whose elements cannot be compared with each other.
    """
    if isinstance(o, set):
        try:
            o = sorted(o)
        except TypeError as err:
            # Only the "elements are not comparable" failure is translated;
            # the original cause stays attached for debugging.
            raise TypeError('set is not orderable') from err
    return _pandas_json_dumps(o)
def transform_data(obj):
    """Unfinished placeholder; currently returns ``None`` for every input.

    The only work done is probing ``obj`` for an ``__iter__`` attribute.
    NOTE(review): iterable and non-iterable inputs both end up returning
    ``None``, and nothing else in the visible code calls this function.
    """
    # The probe is kept so attribute access on ``obj`` happens as before.
    _ = hasattr(obj, '__iter__')
    return None
def to_hash(obj):
    """Return an integer digest of ``obj`` that is the same in every session.

    The object is serialized to JSON with sorted dictionary keys (objects the
    standard encoder cannot handle are delegated to ``default``), the UTF-8
    bytes of that text are fed to MD5, and the digest is reduced modulo
    ``10 ** 8``.

    Args:
        obj: the object to hash.

    Returns:
        An ``int`` in the range ``[0, 10 ** 8)``.
    """
    payload = json.dumps(obj, sort_keys=True, default=default)
    hex_digest = hashlib.md5(payload.encode("utf-8")).hexdigest()
    bucket_count = 10 ** 8
    return int(hex_digest, 16) % bucket_count
# Smoke test: run this in different interpreters and compare the printed
# values to make sure the hashes persist between sessions.
test_objs = [
    pd.date_range('2019', '2020', freq='Q', tz='utc'),
    pd.date_range('2019', '2020', freq='Q', tz='US/Eastern'),
    pd.date_range('2019', '2020', freq='Q'),
    np.zeros((10, 3)),
    {1, 3, 2},
    # Left disabled: a set mixing ints and strs.
    # set([1,3,2,'a', '10', 'f', 'ii', 'aa']),
    pd.Index([1, 3, 2, 'a']),
    {'c': 1, 'b': 'b', 'a': []},
    {},
    [1, 't', 2, 'a'],
]
for i, obj in enumerate(test_objs):
    print(f"{i} {to_hash(obj)}")
# See also: pandas.util.hash_pandas_object and pandas.util.hash_array for
# dataframes, and `from dask.base import tokenize, normalize_token`.
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment