Skip to content

Instantly share code, notes, and snippets.

@rtempleton
Created April 11, 2020 22:00
Show Gist options
  • Save rtempleton/ef72925f3bb04cb2f7df34f183f3efd1 to your computer and use it in GitHub Desktop.
Save rtempleton/ef72925f3bb04cb2f7df34f183f3efd1 to your computer and use it in GitHub Desktop.
use role sysadmin;
-- set up dev environment
create database if not exists my_test_db;
create schema if not exists fuzzy_match;
create warehouse if not exists dev_wh warehouse_size = 'small' auto_suspend = 300 initially_suspended=true;
use schema my_test_db.fuzzy_match;
use warehouse dev_wh;
-- define the standardize function
create or replace function standardize(a String)
returns string
strict immutable
COMMENT = 'Removes non-alphanumeric characters and casts the result to UPPER case'
as $$ select REGEXP_REPLACE(UPPER(a),'[^A-Z0-9 ]', '') $$;
-- create a samples data set derived from the TPCDS sample datasets - standardize these fields while creating the table
create or replace transient table fuzzy_match.samples as(
select
standardize(a.c_customer_sk) as cust_key
, standardize(a.c_current_addr_sk) as add_key
, standardize(a.c_first_name) as fname
, standardize(a.c_last_name) as lname
, standardize(concat(b.ca_street_number, ' ', b.ca_street_name, ' ', b.ca_street_type)) as address
, standardize(b.ca_city) as city
, standardize(b.ca_state) as state
, standardize(b.ca_zip) as zip
from "SNOWFLAKE_SAMPLE_DATA"."TPCDS_SF10TCL"."CUSTOMER" a
inner join "SNOWFLAKE_SAMPLE_DATA"."TPCDS_SF10TCL"."CUSTOMER_ADDRESS" b
on a.c_current_addr_sk = b.ca_address_sk
where a.c_first_name is not null and
a.c_last_name is not null and
address is not null and
ca_state is not null and
ca_zip is not null)
limit 500000; --dial this down to limit the amount of computing that gets used later
-- Define a dictionary table
create or replace transient table dictionary (
token String not null,
term String not null,
context String not null,
constraint UK unique (token, context)
);
/*
Lets look for some candidate names to add to the dictionary
get back all the fnames that are less than 5 characters
*/
select distinct fname from samples where len(fname) < 5;
--MAX is one of the returned values, let's see what other names in the SAMPLES align to MAX
select distinct fname from samples where fname like ('MAX%');
--add some name terms to the dictionary
insert into dictionary values ('MAXINE', 'MAX', 'NAME'), ('MAXIMO', 'MAX', 'NAME'), ('MAXIE', 'MAX', 'NAME'), ('MAXWELL', 'MAX', 'NAME');
insert into dictionary values ('JEFFREY', 'JEFF', 'NAME'), ('JEFFERY', 'JEFF', 'NAME'), ('JEFFRY', 'JEFF', 'NAME'), ('JEFFIE', 'JEFF', 'NAME'), ('JEFFERSON', 'JEFF', 'NAME');
insert into dictionary values ('JACKIE', 'JACK', 'NAME'), ('JACKSON', 'JACK', 'NAME'), ('JACKI', 'JACK', 'NAME'), ('JACKLYN', 'JACK', 'NAME'), ('JACKELYN', 'JACK', 'NAME');
insert into dictionary values ('LEONA', 'LEO', 'NAME'), ('LEONARDO', 'LEO', 'NAME'), ('LEONILA', 'LEO', 'NAME'), ('LEOLA', 'LEO', 'NAME'), ('LEON', 'LEO', 'NAME'), ('LEONARD', 'LEO', 'NAME'), ('LEONIDA', 'LEO', 'NAME');
--add some address terms to the dictionary
insert into dictionary values ('AVENUE', 'AVE', 'ADDRESS'), ('STREET', 'ST', 'ADDRESS'), ('BOULEVARD', 'BLVD', 'ADDRESS'), ('COURT', 'CT', 'ADDRESS'), ('LANE', 'LN', 'ADDRESS'), ('CIRCLE', 'CIR', 'ADDRESS'), ('PARKWAY', 'PKWY', 'ADDRESS'), ('WAY', 'WY', 'ADDRESS'), ('DRIVE', 'DR', 'ADDRESS');
insert into dictionary values ('FIRST', '1ST', 'ADDRESS'), ('SECOND', '2ND', 'ADDRESS'), ('THIRD', '3RD', 'ADDRESS'), ('FOURTH', '4TH', 'ADDRESS'), ('FIFTH', '5TH', 'ADDRESS'), ('SIXTH', '6TH', 'ADDRESS'), ('SEVENTH', '7TH', 'ADDRESS'), ('EIGHTH', '8TH', 'ADDRESS'), ('NINTH', '9TH', 'ADDRESS');
--alter the samples table to hold standardized fname and address values
alter table samples add column fname_std string, address_std string;
--apply the standardized fname to the new fname_std column
update samples s set fname_std = d.name from (
select
a.cust_key,
coalesce(b.term, a.fname) name
from samples a
left join dictionary b
on a.fname = b.token and context = 'NAME') d
where s.cust_key = d.cust_key;
--see the records that were updated
select * from samples where fname != fname_std;
/*
The query below tokenizes the address field from the samples table and then performs lookups into the dictionary table.
It then puts the tokens back together and saves the new standardized address values in a temp table. Note we use the unique
CUST_KEY as there may exist duplicate ADD_KEYS (such in cases of households)
*/
create or replace temp table standard_address as (
select
cust_key,
trim(REGEXP_REPLACE(x,'[^A-Z0-9 ]', ' ')) address from (
select
cust_key,
to_json(array_agg(v) within group (order by idx asc)) x from(
with tokenized as (
select
t.index,
t.value,
s.cust_key
from samples s, lateral split_to_table(s.address, ' ') t
where value > ''
)
select
t.cust_key,
t.index idx,
coalesce(d.term, t.value) v
from tokenized t
left join dictionary d
on t.value = d.token and d.context = 'ADDRESS'
order by 1,2
)
group by cust_key));
select * from standard_address limit 20;
--Join the updated address values back into our samples table
update samples s set s.address_std = a.address from
standard_address a
where s.cust_key = a.cust_key;
--define out fuzzy_scoring UDF
create or replace function fuzzy_score(a String, b String)
returns number
strict
immutable
COMMENT = 'Takes two strings and returns a similarity score between 1 and 0'
as 'select 1.0-(editdistance(a, b)/greatest(length(a),length(b)))';
-- use this query to check the number of pairs that will be generated based on your blocking key
select
sum(pairs) pairs from (
select
state,
count(*) recs,
(count(*)*(count(*)-1)/2)::number(12,0) pairs
from samples
group by 1);
/*
Generate pairs for downstream scoring evaluation
Note this uses ZIP as the blocking key
Consider sorting the source table by the zip and cust_key for better performance
create or replace table samples as (select * from samples order by zip, cust_key);
*/
create or replace transient table candidate_pairs as (
select
t1.cust_key cust_key1
, t2.cust_key cust_key2
, t1.fname fname1
, t2.fname fname2
, t1.lname lname1
, t2.lname lname2
, t1.address address1
, t2.address address2
, t1.fname_std fname_std1
, t2.fname_std fname_std2
, t1.address_std address_std1
, t2.address_std address_std2
from samples t1
inner join samples t2 on t1.zip = t2.zip and t1.cust_key < t2.cust_key);
--flex the warehouse up, run the scoring then flex back down again
alter warehouse dev_wh set warehouse_size = 'XLARGE';
create or replace transient table scores as(
select
cust_key1 || '-' || cust_key2 pair_key
, cust_key1
, cust_key2
, fuzzy_score(fname1, fname2) fname_fuzzy
, fuzzy_score(soundex(fname1), soundex(fname2)) fname_soundex
, fuzzy_score(fname_std1, fname_std2) fname_std_fuzzy
, fuzzy_score(lname1, lname2) lname_fuzzy
, fuzzy_score(soundex(lname1), soundex(lname2)) lname_soundex
, fuzzy_score(address1, address2) address_fuzzy
, fuzzy_score(address_std1, address_std2) address_std_fuzzy
from candidate_pairs);
alter warehouse dev_wh set warehouse_size = 'SMALL';
-- Look at some records that are potential duplicates - play with the thresholds and other score values to see compared records in over/under comparison
with candidates as (
select
pair_key,
cust_key1,
cust_key2
from scores
where fname_soundex >= .75 and fname_fuzzy >= .75 and lname_fuzzy >= .75 and address_std_fuzzy >= .50
)
select
pair_key,
samples.*
from candidates
inner join samples on cust_key = cust_key1
union
select
pair_key,
samples.*
from candidates
inner join samples on cust_key = cust_key2
order by pair_key, cust_key
limit 100;
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment