Created
April 3, 2022 07:13
-
-
Save DaveRuijter/938c2dda036b81f7d1cdcd6189683e5e to your computer and use it in GitHub Desktop.
A couple of functions to easily create an integer-based hash. Use them for the key column of a dimension.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Register an HTML-tag-stripping UDF for use from Spark SQL with an explicit
# "string" return type. NOTE(review): udf_removehtmltagsfromstring is defined
# elsewhere in this notebook/file — not visible in this chunk.
spark.udf.register("udf_removehtmltagsfromstring", udf_removehtmltagsfromstring, "string")
# Central hashing helper used by the other UDFs. It wraps blake2b so that,
# with one single function, the hashing algorithm can be adjusted when needed.
def udf_centralhash(string: str) -> int:
    """Hash `string` with blake2b and return the digest as an integer.

    The digest is truncated to 6 bytes, which gives a 48-bit integer key —
    a reasonable starting size for dimension surrogate keys. Increase
    `digest_size` if a larger key space is ever required.
    """
    hasher = hashlib.blake2b(digest_size=6)
    hasher.update(string.encode("utf-8"))  # blake2b operates on bytes, so encode as UTF-8
    return int(hasher.hexdigest(), 16)  # hex digest -> integer
# Expose the central hashing function to Spark SQL.
# NOTE(review): no returnType is passed; PySpark's spark.udf.register defaults
# to StringType in that case, so SQL callers would receive the hash as a
# string rather than an integer — confirm this is intended.
spark.udf.register("udf_centralhash", udf_centralhash)
# Coerce a value of miscellaneous type to a string for hashing/concatenation.
# Integer 0 must come out as "0" (not ""), while NULL/None becomes "".
def udf_toString(var: str = "") -> str:
    """Return `var` as a string: None -> "", integer 0 -> "0"."""
    if isinstance(var, int):
        # `0 or "0"` evaluates to "0", so integer zero survives as "0"
        # instead of collapsing to an empty string.
        return str(var or "0")
    # For everything else, falsy values (None, "") collapse to "".
    return str(var or "")
# Expose the string-coercion helper to Spark SQL (return type left to the
# PySpark default, which is StringType — appropriate here).
spark.udf.register("udf_toString", udf_toString)
# Hash dimension column values into a surrogate key, for building either a
# dimension or a fact table. Accepts up to 15 keys (extend the signature if
# more are ever needed). Returns -1 when every key is empty/NULL.
def udf_generate_key(
    key1: str = "",
    key2: str = "",
    key3: str = "",
    key4: str = "",
    key5: str = "",
    key6: str = "",
    key7: str = "",
    key8: str = "",
    key9: str = "",
    key10: str = "",
    key11: str = "",
    key12: str = "",
    key13: str = "",
    key14: str = "",
    key15: str = "",
) -> int:
    """Concatenate the stringified keys pipe-separated and hash the result.

    Returns -1 when the concatenation is empty/whitespace-only (i.e. no key
    carried a value); otherwise the integer hash from udf_centralhash.
    """
    parts = [
        udf_toString(k)
        for k in (
            key1, key2, key3, key4, key5,
            key6, key7, key8, key9, key10,
            key11, key12, key13, key14, key15,
        )
    ]
    # Pipe-delimit so ("a","b") and ("ab","") hash differently, then drop
    # the pipes contributed by unused trailing keys.
    combined = "|".join(parts).rstrip("|")
    if not combined.strip():  # only whitespace/tabs left -> no real key values
        return -1
    return udf_centralhash(combined)  # central hashing function
# Expose the key-generation function to Spark SQL.
# NOTE(review): as with udf_centralhash, no returnType is given, so PySpark
# defaults to StringType — the integer key would arrive as a string in SQL.
spark.udf.register("udf_generate_key", udf_generate_key)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment