Created
September 29, 2024 18:28
Revisions
-
dbreunig created this gist
Sep 29, 2024 .There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -0,0 +1,63 @@ # REGEXP_EXTRACT(i.Address, '^[0-9]+') AS i_address_number, jws_matches_df = con.sql(""" WITH ranked_matches AS ( SELECT i.Facility_ID as fid, p.id as gers, i.Facility_Name as i_name, p.Facility_Name as p_name, i.Address, p.Address, jaro_winkler_similarity(i.Facility_Name, p.Facility_Name) as name_similarity, jaro_winkler_similarity(i.Address, p.Address) as address_similarity, CASE WHEN REGEXP_EXTRACT(i.Address, '^[0-9]+') = REGEXP_EXTRACT(p.Address, '^[0-9]+') THEN 1 ELSE 0 END AS address_number_match, ROW_NUMBER() OVER ( PARTITION BY i.Facility_ID ORDER BY jaro_winkler_similarity(i.Facility_Name, p.Facility_Name) DESC ) as rank FROM ( SELECT DISTINCT Facility_Name, Facility_ID, Address, h3 FROM inspections ) i JOIN places p ON i.h3 = p.h3 AND jaro_winkler_similarity(i.Facility_Name, p.Facility_Name) > 0.83 AND jaro_winkler_similarity(i.Address, p.Address) > 0.75 ), filtered_matches AS ( SELECT fid, gers, i_name, p_name, Address, address_similarity, name_similarity, address_number_match, rank, ROW_NUMBER() OVER ( PARTITION BY fid ORDER BY CASE WHEN address_number_match = 1 THEN 1 ELSE 2 END, rank ) as new_rank FROM ranked_matches ) SELECT fid, gers, i_name, p_name, Address, address_similarity, name_similarity, address_number_match FROM filtered_matches WHERE new_rank = 1 """).df()