@DaveRuijter
Created April 3, 2022 07:13
A couple of functions to easily create an integer-based hash. Use it for the key column of a dimension.
spark.udf.register("udf_removehtmltagsfromstring", udf_removehtmltagsfromstring, "string")
# This is the central hashing function, used by the other functions. It uses the blake2b hashing
# algorithm. With a central function, we can adjust the hashing in one place when needed.
def udf_centralhash(string: str) -> int:
    val = hashlib.blake2b(
        digest_size=6
    )  # Increase the digest size to make the hash bigger. 6 bytes seems a good start for our use in dimensions.
    val.update(string.encode("utf-8"))  # feed the input string as UTF-8 to the blake2b object
    intval = int(val.hexdigest(), 16)  # and convert the hex digest to an integer
    return intval


spark.udf.register("udf_centralhash", udf_centralhash)
# Convert a value of any type to a string.
# An integer 0 should convert to the string "0", not to an empty string.
def udf_toString(var="") -> str:
    if isinstance(var, int):
        var = str(var or "0")  # The or "0" maps the falsy integer 0 to "0" instead of "".
    else:
        var = str(var or "")  # The or "" maps NULL (None) values to an empty string.
    return var


spark.udf.register("udf_toString", udf_toString)
# Function to use when hashing dimension column values, either when building a dimension or a fact table.
# It supports a maximum of 15 keys for now; this can be increased if needed.
# When the keys contain nothing, it returns -1.
def udf_generate_key(
    key1: str = "",
    key2: str = "",
    key3: str = "",
    key4: str = "",
    key5: str = "",
    key6: str = "",
    key7: str = "",
    key8: str = "",
    key9: str = "",
    key10: str = "",
    key11: str = "",
    key12: str = "",
    key13: str = "",
    key14: str = "",
    key15: str = "",
) -> int:
strConcat = "|".join(
[
udf_toString(key1),
udf_toString(key2),
udf_toString(key3),
udf_toString(key4),
udf_toString(key5),
udf_toString(key6),
udf_toString(key7),
udf_toString(key8),
udf_toString(key9),
udf_toString(key10),
udf_toString(key11),
udf_toString(key12),
udf_toString(key13),
udf_toString(key14),
udf_toString(key15),
]
) # Concat all keys seperated by a pipe character.
strConcat = strConcat.rstrip("|") # Strip trailing pipe characters
if strConcat.strip() == "": # Strip trailing spaces and tabs.
return -1
else: # Else hash the string
return udf_centralhash(strConcat) # Using the central hashing function
spark.udf.register("udf_generate_key", udf_generate_key)