Skip to content

Instantly share code, notes, and snippets.

@RokoMijic
Created July 6, 2020 12:23
Show Gist options
  • Save RokoMijic/70186e7f7a05acfe2ed2261b9f4b1ffb to your computer and use it in GitHub Desktop.
Save RokoMijic/70186e7f7a05acfe2ed2261b9f4b1ffb to your computer and use it in GitHub Desktop.
def efficient_hash_df(df, index=True, small_frame_limit=99999, num_chars=10):
    """
    Return a short, deterministic, readable hash of a DataFrame.

    Frames with at most ``small_frame_limit`` rows are hashed in full.
    Larger frames are hashed from two ingredients, for speed:

    * a fixed-seed random sample (with replacement) of ``small_frame_limit``
      rows, and
    * the column sums of every NumPy-numeric column of the whole frame.

    Consequently two large frames that differ only in unsampled cells of
    non-numeric columns (or in ways that leave every numeric sum unchanged)
    can receive the same hash.

    Parameters
    ----------
    df : pandas.DataFrame
        The frame to hash.
    index : bool, default True
        Forwarded to ``pd.util.hash_pandas_object``; whether the index
        takes part in the row hashes.
    small_frame_limit : int, default 99999
        Maximum row count hashed in full; also the sample size used for
        larger frames.
    num_chars : int, default 10
        Length of the returned string (a SHA-1 digest yields at most 27
        characters in this encoding).

    Returns
    -------
    str
        The first ``num_chars`` characters of the SHA-1 digest in a
        base64 variant with padding removed, ``/`` written as ``$`` and
        ``+`` written as ``&``.
    """
    def _row_hash_digest(frame):
        # SHA-1 over the raw buffer of the per-row hashes pandas computes.
        row_hashes = pd.util.hash_pandas_object(frame, index=index).values
        return hashlib.sha1(row_hashes).digest()

    if df.shape[0] <= small_frame_limit:
        raw_digest = _row_hash_digest(df)
    else:
        combined = hashlib.sha1()

        # random_state=0 keeps the sample, and therefore the hash, repeatable.
        sample = df.sample(n=small_frame_limit, replace=True, random_state=0)
        combined.update(_row_hash_digest(sample))

        # Column sums make the hash sensitive to numeric rows the sample
        # missed.  np.issubdtype raises TypeError on pandas extension dtypes
        # (categorical, nullable Int64, string, ...), so only genuine NumPy
        # dtypes are tested; extension columns are left out of the sums.
        # Iterating items() (rather than df[label]) also copes with
        # duplicated column labels.
        numeric_sums = [
            column.sum()
            for _, column in df.items()
            if isinstance(column.dtype, np.dtype)
            and np.issubdtype(column.dtype, np.number)
        ]
        combined.update(hashlib.sha1(np.array(numeric_sums)).digest())
        raw_digest = combined.digest()

    # The 'base64' codec appends newlines; strip them and the '=' padding,
    # and swap the two symbols that are awkward in file names / URLs.
    readable = str.maketrans("/+", "$&", "\n=")
    b64digest = codecs.encode(raw_digest, 'base64').decode().translate(readable)
    return b64digest[:num_chars]
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment