Mark-McCracken/nysiis.sql

## nysiis.sql
CREATE OR REPLACE FUNCTION `functions.fuzzy_match_nysiis`(input STRING) AS ((
select
  string_agg(c, "" order by off asc)
from (select
        c,
        off,
        coalesce(lag(c) over (order by off) = c, false) as same_as_prev_char
      from unnest(split((select
              regexp_replace(regexp_replace(regexp_replace(string_agg(new_c, "" order by off), r"s+$", ""), "ay$", "y"), "a+$", "") as step_7
            from (select
                    c,
                    if(off = 0, c,
                    case
                      when off != 0 and c = 'e' and next_c = 'v' then 'a'
                      when off != 1 and c = 'v' and prev_c = 'e' then 'f'

                      when c in ('a', 'e', 'i', 'o', 'u') then 'a'

                      when c = 'q' then 'g'
                      when c = 'z' then 's'
                      when c = 'm' then 'n'

                      when c = 'k' and next_c = 'n' then 'n'
                      when off != 1 and prev_c = 'k' and c = 'n' then null
                      when c = 'k' then 'c'

                      when c = 's' and next_c = 'c' and next2_c = 'h' then 's'
                      when off != 1 and prev_c = 's' and c = 'c' and next_c = 'h' then 's'
                      when off != 2 and prev2_c = 's' and prev_c = 'c' and c = 'h' then 's'

                      when c = 'p' and next_c = 'h' then 'f'
                      when off != 1 and prev_c = 'p' and c = 'h' then 'f'

                      when off != 1 and (c = 'h' and (prev_c not in ('a', 'e', 'i', 'o', 'u') or next_c not in ('a', 'e', 'i', 'o', 'u'))) then prev_c
                      when off != 1 and c = 'w' and prev_c in ('a', 'e', 'i', 'o', 'u') then prev_c
                      else c
                    end) as new_c,
                    off
                  from (select
                          c,
                          off,
                          lag(c) over (order by off) as prev_c,
                          lag(c, 2) over (order by off) as prev2_c,
                          lead(c) over (order by off) as next_c,
                          lead(c, 2) over (order by off) as next2_c
                        from unnest(split(regexp_replace(regexp_replace(regexp_replace(
                                          regexp_replace(regexp_replace(regexp_replace(
                                          regexp_replace(lower(input), "^mac", "mcc"),
                                                          "^kn", "nn"),
                                                          "^k", "c"),
                                                          "^p(h|f)", "ff"),
                                                          "^sch", "sss"),
                                                          "(e|i)e$", "y"),
                                                          "(dt|rt|rd|nt|nd)$", "d")
                                      , "")) as c with offset off -- starts at 0
                        )
                  )
            where new_c is not null
            ), "")) as c with offset off
      )
where ((not same_as_prev_char) or (same_as_prev_char and off = 1))
));
	CREATE OR REPLACE FUNCTION `functions.fuzzy_match_nysiis`(input STRING) AS ((
	select
	string_agg(c, "" order by off asc)
	from (select
	c,
	off,
	coalesce(lag(c) over (order by off) = c, false) as same_as_prev_char
	from unnest(split((select
	regexp_replace(regexp_replace(regexp_replace(string_agg(new_c, "" order by off), r"s+$", ""), "ay$", "y"), "a+$", "") as step_7
	from (select
	c,
	if(off = 0, c,
	case
	when off != 0 and c = 'e' and next_c = 'v' then 'a'
	when off != 1 and c = 'v' and prev_c = 'e' then 'f'

	when c in ('a', 'e', 'i', 'o', 'u') then 'a'

	when c = 'q' then 'g'
	when c = 'z' then 's'
	when c = 'm' then 'n'

	when c = 'k' and next_c = 'n' then 'n'
	when off != 1 and prev_c = 'k' and c = 'n' then null
	when c = 'k' then 'c'

	when c = 's' and next_c = 'c' and next2_c = 'h' then 's'
	when off != 1 and prev_c = 's' and c = 'c' and next_c = 'h' then 's'
	when off != 2 and prev2_c = 's' and prev_c = 'c' and c = 'h' then 's'

	when c = 'p' and next_c = 'h' then 'f'
	when off != 1 and prev_c = 'p' and c = 'h' then 'f'

	when off != 1 and (c = 'h' and (prev_c not in ('a', 'e', 'i', 'o', 'u') or next_c not in ('a', 'e', 'i', 'o', 'u'))) then prev_c
	when off != 1 and c = 'w' and prev_c in ('a', 'e', 'i', 'o', 'u') then prev_c
	else c
	end) as new_c,
	off
	from (select
	c,
	off,
	lag(c) over (order by off) as prev_c,
	lag(c, 2) over (order by off) as prev2_c,
	lead(c) over (order by off) as next_c,
	lead(c, 2) over (order by off) as next2_c
	from unnest(split(regexp_replace(regexp_replace(regexp_replace(
	regexp_replace(regexp_replace(regexp_replace(
	regexp_replace(lower(input), "^mac", "mcc"),
	"^kn", "nn"),
	"^k", "c"),
	"^p(h\|f)", "ff"),
	"^sch", "sss"),
	"(e\|i)e$", "y"),
	"(dt\|rt\|rd\|nt\|nd)$", "d")
	, "")) as c with offset off -- starts at 0
	)
	)
	where new_c is not null
	), "")) as c with offset off
	)
	where ((not same_as_prev_char) or (same_as_prev_char and off = 1))
	));