Created
September 29, 2024 18:28
-
-
Save dbreunig/77d8c991f596ae17a8732884e585a02d to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# REGEXP_EXTRACT(i.Address, '^[0-9]+') AS i_address_number, | |
jws_matches_df = con.sql(""" | |
WITH ranked_matches AS ( | |
SELECT | |
i.Facility_ID as fid, | |
p.id as gers, | |
i.Facility_Name as i_name, | |
p.Facility_Name as p_name, | |
i.Address, | |
p.Address, | |
jaro_winkler_similarity(i.Facility_Name, p.Facility_Name) as name_similarity, | |
jaro_winkler_similarity(i.Address, p.Address) as address_similarity, | |
CASE | |
WHEN REGEXP_EXTRACT(i.Address, '^[0-9]+') = REGEXP_EXTRACT(p.Address, '^[0-9]+') THEN 1 | |
ELSE 0 | |
END AS address_number_match, | |
ROW_NUMBER() OVER ( | |
PARTITION BY i.Facility_ID | |
ORDER BY jaro_winkler_similarity(i.Facility_Name, p.Facility_Name) DESC | |
) as rank | |
FROM ( | |
SELECT DISTINCT Facility_Name, Facility_ID, Address, h3 | |
FROM inspections | |
) i | |
JOIN places p | |
ON i.h3 = p.h3 | |
AND jaro_winkler_similarity(i.Facility_Name, p.Facility_Name) > 0.83 | |
AND jaro_winkler_similarity(i.Address, p.Address) > 0.75 | |
), | |
filtered_matches AS ( | |
SELECT | |
fid, | |
gers, | |
i_name, | |
p_name, | |
Address, | |
address_similarity, | |
name_similarity, | |
address_number_match, | |
rank, | |
ROW_NUMBER() OVER ( | |
PARTITION BY fid | |
ORDER BY | |
CASE | |
WHEN address_number_match = 1 THEN 1 | |
ELSE 2 | |
END, | |
rank | |
) as new_rank | |
FROM ranked_matches | |
) | |
SELECT | |
fid, | |
gers, | |
i_name, | |
p_name, | |
Address, | |
address_similarity, | |
name_similarity, | |
address_number_match | |
FROM filtered_matches | |
WHERE new_rank = 1 | |
""").df() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment