Created
July 6, 2020 12:23
-
-
Save RokoMijic/70186e7f7a05acfe2ed2261b9f4b1ffb to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def efficient_hash_df(df, index=True, small_frame_limit=99999, num_chars=10):
    """Return a short, deterministic, readable hash of a DataFrame.

    Frames with at most ``small_frame_limit`` rows are hashed in full: the
    per-row hashes from ``pd.util.hash_pandas_object`` are fed to SHA-1.

    Larger frames are summarised instead of hashed in full, so two large
    frames that differ only in unsampled rows of non-numeric columns can
    receive the same hash. The summary combines:

    * the SHA-1 of the row hashes of a fixed-seed sample of
      ``small_frame_limit`` rows (drawn with replacement), and
    * the SHA-1 of the array of column sums of every NumPy-numeric column.

    Columns backed by pandas extension dtypes (categorical, nullable
    integers, tz-aware datetimes, string dtype, ...) are left out of the
    sums; they still contribute through the sampled row hashes.

    Args:
        df: The DataFrame to hash.
        index: Whether the index takes part in the row hashes.
        small_frame_limit: Maximum row count hashed in full; also the
            sample size used for larger frames.
        num_chars: Number of leading characters of the encoded digest to
            return. A SHA-1 digest encodes to 27 characters, so larger
            values yield at most 27.

    Returns:
        The first ``num_chars`` characters of the base64-encoded digest,
        with padding and newlines removed, ``/`` replaced by ``$`` and
        ``+`` replaced by ``&``.
    """
    if df.shape[0] <= small_frame_limit:
        row_hashes = pd.util.hash_pandas_object(df, index=index).values
        digest = hashlib.sha1(row_hashes).digest()
    else:
        # Fixed random_state keeps the sample, and so the hash, reproducible.
        sample = df.sample(n=small_frame_limit, replace=True, random_state=0)
        sample_hashes = pd.util.hash_pandas_object(sample, index=index).values

        # np.issubdtype raises TypeError for pandas extension dtypes, so only
        # consult it for genuine NumPy dtypes. Iterating with items() always
        # yields a Series per column, even when column labels are duplicated.
        column_sums = [
            column.sum()
            for _, column in df.items()
            if isinstance(column.dtype, np.dtype)
            and np.issubdtype(column.dtype, np.number)
        ]

        combined = hashlib.sha1()
        combined.update(hashlib.sha1(sample_hashes).digest())
        combined.update(hashlib.sha1(np.array(column_sums)).digest())
        digest = combined.digest()

    # Drop base64 line breaks and padding, and swap the two punctuation
    # characters of the base64 alphabet for '$' and '&'.
    readable = str.maketrans({"\n": None, "=": None, "/": "$", "+": "&"})
    encoded = codecs.encode(digest, "base64").decode().translate(readable)
    return encoded[:num_chars]
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment