Skip to content

Instantly share code, notes, and snippets.

Revisions

  1. dbreunig created this gist Sep 29, 2024.
    63 changes: 63 additions & 0 deletions conflation_demo_jwt_matches_with_address_number_filter.py
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,63 @@
    # REGEXP_EXTRACT(i.Address, '^[0-9]+') AS i_address_number,
    jws_matches_df = con.sql("""
    WITH ranked_matches AS (
    SELECT
    i.Facility_ID as fid,
    p.id as gers,
    i.Facility_Name as i_name,
    p.Facility_Name as p_name,
    i.Address,
    p.Address,
    jaro_winkler_similarity(i.Facility_Name, p.Facility_Name) as name_similarity,
    jaro_winkler_similarity(i.Address, p.Address) as address_similarity,
    CASE
    WHEN REGEXP_EXTRACT(i.Address, '^[0-9]+') = REGEXP_EXTRACT(p.Address, '^[0-9]+') THEN 1
    ELSE 0
    END AS address_number_match,
    ROW_NUMBER() OVER (
    PARTITION BY i.Facility_ID
    ORDER BY jaro_winkler_similarity(i.Facility_Name, p.Facility_Name) DESC
    ) as rank
    FROM (
    SELECT DISTINCT Facility_Name, Facility_ID, Address, h3
    FROM inspections
    ) i
    JOIN places p
    ON i.h3 = p.h3
    AND jaro_winkler_similarity(i.Facility_Name, p.Facility_Name) > 0.83
    AND jaro_winkler_similarity(i.Address, p.Address) > 0.75
    ),
    filtered_matches AS (
    SELECT
    fid,
    gers,
    i_name,
    p_name,
    Address,
    address_similarity,
    name_similarity,
    address_number_match,
    rank,
    ROW_NUMBER() OVER (
    PARTITION BY fid
    ORDER BY
    CASE
    WHEN address_number_match = 1 THEN 1
    ELSE 2
    END,
    rank
    ) as new_rank
    FROM ranked_matches
    )
    SELECT
    fid,
    gers,
    i_name,
    p_name,
    Address,
    address_similarity,
    name_similarity,
    address_number_match
    FROM filtered_matches
    WHERE new_rank = 1
    """).df()