@DaveRuijter
Created April 3, 2022 07:13
A couple of functions to easily create an integer-based hash. Use it for the key column of a dimension.
spark.udf.register("udf_removehtmltagsfromstring", udf_removehtmltagsfromstring, "string")
# This is the central hashing function, used by the other functions. It uses the blake2b hashing
# algorithm. With a central function, we can adjust the hashing in one place when needed.
def udf_centralhash(string: str) -> int:
    val = hashlib.blake2b(
        digest_size=6
    )  # Increase the digest size to make the hash bigger. 6 bytes seems a good start for our use in dimensions.
    val.update(string.encode("utf-8"))  # feed the input string as UTF-8 to the blake2b object
    intval = int(val.hexdigest(), 16)  # and convert the hex digest to an integer
    return intval


spark.udf.register("udf_centralhash", udf_centralhash)
# Convert a value of any type to a string.
# An integer 0 should convert to the string "0", not to an empty string.
def udf_toString(var="") -> str:
    if isinstance(var, int):
        var = str(var or "0")  # The or "0" maps the falsy integer 0 to "0" instead of "".
    else:
        var = str(var or "")  # The or "" maps NULL (None) values to an empty string.
    return var


spark.udf.register("udf_toString", udf_toString)
# Function to use when hashing dimension column values, either when building a dimension or a fact table.
# It supports a maximum of 15 keys for now; this can be increased if needed.
# When the keys contain nothing, it returns -1.
def udf_generate_key(
    key1: str = "",
    key2: str = "",
    key3: str = "",
    key4: str = "",
    key5: str = "",
    key6: str = "",
    key7: str = "",
    key8: str = "",
    key9: str = "",
    key10: str = "",
    key11: str = "",
    key12: str = "",
    key13: str = "",
    key14: str = "",
    key15: str = "",
) -> int:
strConcat = "|".join(
[
udf_toString(key1),
udf_toString(key2),
udf_toString(key3),
udf_toString(key4),
udf_toString(key5),
udf_toString(key6),
udf_toString(key7),
udf_toString(key8),
udf_toString(key9),
udf_toString(key10),
udf_toString(key11),
udf_toString(key12),
udf_toString(key13),
udf_toString(key14),
udf_toString(key15),
]
) # Concat all keys seperated by a pipe character.
strConcat = strConcat.rstrip("|") # Strip trailing pipe characters
if strConcat.strip() == "": # Strip trailing spaces and tabs.
return -1
else: # Else hash the string
return udf_centralhash(strConcat) # Using the central hashing function
spark.udf.register("udf_generate_key", udf_generate_key)