Created
November 3, 2019 15:40
-
-
Save Mark-McCracken/f139426faa4e67c10c41ae957da1fa41 to your computer and use it in GitHub Desktop.
BigQuery string fuzzy matching nysiis
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
CREATE OR REPLACE FUNCTION `functions.fuzzy_match_nysiis`(input STRING) AS (( | |
select | |
string_agg(c, "" order by off asc) | |
from (select | |
c, | |
off, | |
coalesce(lag(c) over (order by off) = c, false) as same_as_prev_char | |
from unnest(split((select | |
regexp_replace(regexp_replace(regexp_replace(string_agg(new_c, "" order by off), r"s+$", ""), "ay$", "y"), "a+$", "") as step_7 | |
from (select | |
c, | |
if(off = 0, c, | |
case | |
when off != 0 and c = 'e' and next_c = 'v' then 'a' | |
when off != 1 and c = 'v' and prev_c = 'e' then 'f' | |
when c in ('a', 'e', 'i', 'o', 'u') then 'a' | |
when c = 'q' then 'g' | |
when c = 'z' then 's' | |
when c = 'm' then 'n' | |
when c = 'k' and next_c = 'n' then 'n' | |
when off != 1 and prev_c = 'k' and c = 'n' then null | |
when c = 'k' then 'c' | |
when c = 's' and next_c = 'c' and next2_c = 'h' then 's' | |
when off != 1 and prev_c = 's' and c = 'c' and next_c = 'h' then 's' | |
when off != 2 and prev2_c = 's' and prev_c = 'c' and c = 'h' then 's' | |
when c = 'p' and next_c = 'h' then 'f' | |
when off != 1 and prev_c = 'p' and c = 'h' then 'f' | |
when off != 1 and (c = 'h' and (prev_c not in ('a', 'e', 'i', 'o', 'u') or next_c not in ('a', 'e', 'i', 'o', 'u'))) then prev_c | |
when off != 1 and c = 'w' and prev_c in ('a', 'e', 'i', 'o', 'u') then prev_c | |
else c | |
end) as new_c, | |
off | |
from (select | |
c, | |
off, | |
lag(c) over (order by off) as prev_c, | |
lag(c, 2) over (order by off) as prev2_c, | |
lead(c) over (order by off) as next_c, | |
lead(c, 2) over (order by off) as next2_c | |
from unnest(split(regexp_replace(regexp_replace(regexp_replace( | |
regexp_replace(regexp_replace(regexp_replace( | |
regexp_replace(lower(input), "^mac", "mcc"), | |
"^kn", "nn"), | |
"^k", "c"), | |
"^p(h|f)", "ff"), | |
"^sch", "sss"), | |
"(e|i)e$", "y"), | |
"(dt|rt|rd|nt|nd)$", "d") | |
, "")) as c with offset off -- starts at 0 | |
) | |
) | |
where new_c is not null | |
), "")) as c with offset off | |
) | |
where ((not same_as_prev_char) or (same_as_prev_char and off = 1)) | |
)); |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment