Skip to content

Instantly share code, notes, and snippets.

@Mark-McCracken
Created November 3, 2019 15:40
Show Gist options
  • Save Mark-McCracken/f139426faa4e67c10c41ae957da1fa41 to your computer and use it in GitHub Desktop.
Save Mark-McCracken/f139426faa4e67c10c41ae957da1fa41 to your computer and use it in GitHub Desktop.
BigQuery string fuzzy matching nysiis
CREATE OR REPLACE FUNCTION `functions.fuzzy_match_nysiis`(input STRING) AS ((
select
string_agg(c, "" order by off asc)
from (select
c,
off,
coalesce(lag(c) over (order by off) = c, false) as same_as_prev_char
from unnest(split((select
regexp_replace(regexp_replace(regexp_replace(string_agg(new_c, "" order by off), r"s+$", ""), "ay$", "y"), "a+$", "") as step_7
from (select
c,
if(off = 0, c,
case
when off != 0 and c = 'e' and next_c = 'v' then 'a'
when off != 1 and c = 'v' and prev_c = 'e' then 'f'
when c in ('a', 'e', 'i', 'o', 'u') then 'a'
when c = 'q' then 'g'
when c = 'z' then 's'
when c = 'm' then 'n'
when c = 'k' and next_c = 'n' then 'n'
when off != 1 and prev_c = 'k' and c = 'n' then null
when c = 'k' then 'c'
when c = 's' and next_c = 'c' and next2_c = 'h' then 's'
when off != 1 and prev_c = 's' and c = 'c' and next_c = 'h' then 's'
when off != 2 and prev2_c = 's' and prev_c = 'c' and c = 'h' then 's'
when c = 'p' and next_c = 'h' then 'f'
when off != 1 and prev_c = 'p' and c = 'h' then 'f'
when off != 1 and (c = 'h' and (prev_c not in ('a', 'e', 'i', 'o', 'u') or next_c not in ('a', 'e', 'i', 'o', 'u'))) then prev_c
when off != 1 and c = 'w' and prev_c in ('a', 'e', 'i', 'o', 'u') then prev_c
else c
end) as new_c,
off
from (select
c,
off,
lag(c) over (order by off) as prev_c,
lag(c, 2) over (order by off) as prev2_c,
lead(c) over (order by off) as next_c,
lead(c, 2) over (order by off) as next2_c
from unnest(split(regexp_replace(regexp_replace(regexp_replace(
regexp_replace(regexp_replace(regexp_replace(
regexp_replace(lower(input), "^mac", "mcc"),
"^kn", "nn"),
"^k", "c"),
"^p(h|f)", "ff"),
"^sch", "sss"),
"(e|i)e$", "y"),
"(dt|rt|rd|nt|nd)$", "d")
, "")) as c with offset off -- starts at 0
)
)
where new_c is not null
), "")) as c with offset off
)
where ((not same_as_prev_char) or (same_as_prev_char and off = 1))
));
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment