Skip to content

Instantly share code, notes, and snippets.

@sethc23
Created September 16, 2015 17:26
Show Gist options
  • Save sethc23/d58eed7418bc4b50fc6f to your computer and use it in GitHub Desktop.
Save sethc23/d58eed7418bc4b50fc6f to your computer and use it in GitHub Desktop.
f(x) z_str_comp_jaro
-- Remove any existing z_str_comp_jaro with this exact argument signature
-- (no error if it does not exist) before the definition that follows.
DROP FUNCTION IF EXISTS z_str_comp_jaro(text,text,boolean,boolean,boolean);
CREATE OR REPLACE FUNCTION z_str_comp_jaro(s1 text,
s2 text,
winklerize boolean default true,
long_tolerance boolean default true,
verbose boolean default false)
RETURNS double precision
AS $BODY$
  -- Jaro / Jaro-Winkler similarity of s1 and s2, compared case-insensitively
  -- (both inputs are upper-cased byte-wise before comparison).
  --
  -- Parameters:
  --   s1, s2          strings to compare
  --   winklerize      when true, boost the score for a shared prefix of up to
  --                   4 characters (applied only if the Jaro score is > 0.7
  --                   and both strings are longer than 3 characters)
  --   long_tolerance  when true (and winklerize applied), add the extra
  --                   long-string adjustment
  --   verbose         when true, emit intermediate values through log()
  --
  -- Returns a score in [0, 1], or NULL when either input is NULL or empty or
  -- when the strings share no matching characters.

  -- Emit a formatted trace line only when verbose; the message is not built
  -- at all otherwise.
  local function trace(fmt, ...)
    if verbose then log(string.format(fmt, ...)) end
  end

  -- Render a sequence for trace output. cjson is used when it can be loaded;
  -- otherwise fall back to a plain comma-joined rendering so verbose mode
  -- does not depend on an optional library.
  local function dump(tbl)
    local ok, cjson = pcall(require, "cjson")
    if ok then return cjson.encode(tbl) end
    local parts = {}
    for idx = 1, #tbl do parts[idx] = tostring(tbl[idx]) end
    return "[" .. table.concat(parts, ",") .. "]"
  end

  -- SQL NULL arrives as nil; a NULL input yields a NULL score.
  if s1 == nil or s2 == nil then return nil end

  trace("NEW EXECUTION\\n\\n")
  if #s1 == 0 or #s2 == 0 then
    log( "s1 or s2 has no length!")
    return nil
  end

  -- a is always the longer string (or s1 on a tie), b the shorter
  local a, b
  if #s1 < #s2 then a, b = s2, s1 else a, b = s1, s2 end
  a, b = a:upper(), b:upper()
  local a_len, b_len = #a, #b
  trace("a: %s", a)

  -- max distance at which two equal characters still count as matching:
  -- floor(longer_length / 2) - 1, never negative
  local match_dist = math.max(math.floor(a_len / 2) - 1, 0)
  trace("match_dist=%s", match_dist)

  -- per-character tables and "already matched" flags, each sized to its own
  -- string
  local a_chars, a_flags = {}, {}
  for idx = 1, a_len do
    a_chars[idx] = a:sub(idx, idx)
    a_flags[idx] = false
  end
  local b_chars, b_flags = {}, {}
  for idx = 1, b_len do
    b_chars[idx] = b:sub(idx, idx)
    b_flags[idx] = false
  end
  if verbose then
    log("a_tbl " .. dump(a_chars))
    log("b_tbl " .. dump(b_chars))
  end

  -- looking only within the match distance, count & flag matched pairs;
  -- the window is clamped to the valid index range of b
  local common = 0
  for ai = 1, a_len do
    local low = math.max(ai - match_dist, 1)
    local high = math.min(ai + match_dist, b_len)
    for bi = low, high do
      if not b_flags[bi] and b_chars[bi] == a_chars[ai] then
        a_flags[ai], b_flags[bi] = true, true
        common = common + 1
        break
      end
    end
  end
  if verbose then
    log("a_flags=" .. dump(a_flags))
    log("b_flags=" .. dump(b_flags))
  end

  -- return nil if no exact or transpositional matches
  if common == 0 then return nil end
  trace("common = %s", common)

  -- count transpositions: walk the matched characters of a and b in order
  -- and count the positions where they differ
  local next_b, mismatched = 1, 0
  for ai = 1, a_len do
    if a_flags[ai] then
      local bi = next_b
      -- a and b have the same number of flagged entries, so a flagged b
      -- entry always exists; the bound is only a safety net
      while bi <= b_len and not b_flags[bi] do bi = bi + 1 end
      next_b = bi + 1
      if a_chars[ai] ~= b_chars[bi] then mismatched = mismatched + 1 end
    end
  end
  local trans_count = mismatched / 2
  trace("trans_count = %s", trans_count)

  -- Jaro score
  local weight = (common / a_len + common / b_len +
                  (common - trans_count) / common) / 3
  trace("weight = %s", weight)

  -- winkler modification: continue to boost if strings are similar
  if winklerize and weight > 0.7 and a_len > 3 and b_len > 3 then
    -- length of the common prefix, capped at 4 (both strings have at least
    -- 4 characters here, so indexes 1..4 are valid)
    local prefix = 0
    while prefix < 4 and a_chars[prefix + 1] == b_chars[prefix + 1] do
      prefix = prefix + 1
    end
    weight = weight + prefix * 0.1 * (1.0 - weight)
    trace("new weight_1 = %s", weight)

    -- optionally adjust for long strings
    -- after agreeing beginning chars, at least two or more must agree and
    -- agreed characters must be > half of remaining characters
    if long_tolerance and a_len > 4 and common > prefix + 1
       and 2 * common >= a_len + prefix then
      weight = weight + (1.0 - weight) *
               ((common - prefix - 1) / (a_len + b_len - prefix * 2 + 2))
    end
    trace("new weight_2 = %s", weight)
  end

  return weight
$BODY$ LANGUAGE plluau;
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment