Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Save dbreunig/77d8c991f596ae17a8732884e585a02d to your computer and use it in GitHub Desktop.
Save dbreunig/77d8c991f596ae17a8732884e585a02d to your computer and use it in GitHub Desktop.
# REGEXP_EXTRACT(i.Address, '^[0-9]+') AS i_address_number,
jws_matches_df = con.sql("""
WITH ranked_matches AS (
SELECT
i.Facility_ID as fid,
p.id as gers,
i.Facility_Name as i_name,
p.Facility_Name as p_name,
i.Address,
p.Address,
jaro_winkler_similarity(i.Facility_Name, p.Facility_Name) as name_similarity,
jaro_winkler_similarity(i.Address, p.Address) as address_similarity,
CASE
WHEN REGEXP_EXTRACT(i.Address, '^[0-9]+') = REGEXP_EXTRACT(p.Address, '^[0-9]+') THEN 1
ELSE 0
END AS address_number_match,
ROW_NUMBER() OVER (
PARTITION BY i.Facility_ID
ORDER BY jaro_winkler_similarity(i.Facility_Name, p.Facility_Name) DESC
) as rank
FROM (
SELECT DISTINCT Facility_Name, Facility_ID, Address, h3
FROM inspections
) i
JOIN places p
ON i.h3 = p.h3
AND jaro_winkler_similarity(i.Facility_Name, p.Facility_Name) > 0.83
AND jaro_winkler_similarity(i.Address, p.Address) > 0.75
),
filtered_matches AS (
SELECT
fid,
gers,
i_name,
p_name,
Address,
address_similarity,
name_similarity,
address_number_match,
rank,
ROW_NUMBER() OVER (
PARTITION BY fid
ORDER BY
CASE
WHEN address_number_match = 1 THEN 1
ELSE 2
END,
rank
) as new_rank
FROM ranked_matches
)
SELECT
fid,
gers,
i_name,
p_name,
Address,
address_similarity,
name_similarity,
address_number_match
FROM filtered_matches
WHERE new_rank = 1
""").df()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment