Created
September 16, 2015 17:26
-
-
Save sethc23/d58eed7418bc4b50fc6f to your computer and use it in GitHub Desktop.
f(x) z_str_comp_jaro
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
-- z_str_comp_jaro(s1, s2, winklerize, long_tolerance, verbose)
--
-- Jaro / Jaro-Winkler similarity of two strings, implemented in PL/Lua.
--
--   s1, s2          strings to compare (compared case-insensitively via
--                   Lua's string.upper; order of the two does not matter
--                   because the longer one is always used as the reference)
--   winklerize      when true, boost the score for a shared prefix of up
--                   to 4 characters (only applied when the Jaro score is
--                   above 0.7 and both strings are longer than 3)
--   long_tolerance  when true (and winklerize is true), apply the extra
--                   adjustment for long strings
--   verbose         when true, emit trace messages through PL/Lua's log()
--
-- Returns a double precision score (1.0 = identical after upper-casing),
-- or NULL when either argument is NULL or empty, or when the two strings
-- share no matching characters.
DROP FUNCTION IF EXISTS z_str_comp_jaro(text,text,boolean,boolean,boolean);
CREATE OR REPLACE FUNCTION z_str_comp_jaro(s1 text,
                                           s2 text,
                                           winklerize boolean default true,
                                           long_tolerance boolean default true,
                                           verbose boolean default false)
RETURNS double precision
AS $BODY$
    -- Everything below is declared local so nothing leaks out of this
    -- function body into the surrounding Lua state.
    local floor = math.floor

    -- Emit a trace message only when verbose is set. Arguments are passed
    -- separately and converted with tostring(), so a nil value can never
    -- raise a concatenation error (and nothing is formatted at all when
    -- tracing is off).
    local function to_log(...)
        if not verbose then return end
        local parts = {}
        for idx = 1, select("#", ...) do
            parts[idx] = tostring((select(idx, ...)))
        end
        log(table.concat(parts))
    end

    -- Render the first n entries of a flag table as a string of 0/1 digits
    -- (used for tracing only).
    local function flags_to_string(flags, n)
        local digits = {}
        for idx = 1, n do
            digits[idx] = flags[idx] and "1" or "0"
        end
        return table.concat(digits)
    end

    -- SQL NULL arrives as nil; there is nothing to compare.
    if s1 == nil or s2 == nil then
        return nil
    end

    to_log("NEW EXECUTION\\n\\n")

    if #s1 == 0 or #s2 == 0 then
        log("s1 or s2 has no length!")
        return nil
    end

    -- a is always the longer string, b the shorter one.
    local a, b = s1, s2
    if #a < #b then a, b = b, a end
    a, b = a:upper(), b:upper()

    -- NOTE: # and string.byte work on bytes, so multi-byte (e.g. UTF-8)
    -- characters are compared byte by byte.
    local a_len, b_len = #a, #b
    to_log("a: ", a)

    -- Maximum distance at which two equal characters still count as a
    -- match (despite transposition): floor(longer_length / 2) - 1.
    local match_dist = floor(a_len / 2) - 1
    if match_dist < 0 then match_dist = 0 end
    to_log("match_dist=", match_dist)

    -- Looking only within the match distance, count and flag matched pairs.
    -- The flag tables are sparse: an index is present only when matched.
    local a_flags, b_flags = {}, {}
    local common = 0
    for i = 1, a_len do
        local cursor = a:byte(i)
        local low = i - match_dist
        if low < 1 then low = 1 end
        local hi = i + match_dist
        if hi > b_len then hi = b_len end   -- never index past the end of b
        to_log("low_hi ", low, " ", hi)
        for j = low, hi do
            if not b_flags[j] and b:byte(j) == cursor then
                a_flags[i] = true
                b_flags[j] = true
                common = common + 1
                break
            end
        end
    end
    if verbose then
        to_log("a_flags=", flags_to_string(a_flags, a_len))
        to_log("b_flags=", flags_to_string(b_flags, b_len))
    end

    -- return nil if no exact or transpositional matches
    if common == 0 then return nil end
    to_log("common = ", common)

    -- Count transpositions: walk the matched characters of a and b in
    -- order and count the positions where they differ.
    local k, trans_count = 1, 0
    for i = 1, a_len do
        if a_flags[i] then
            -- advance k to the next matched character of b; both strings
            -- have exactly `common` flagged characters, so one exists
            while k <= b_len and not b_flags[k] do
                k = k + 1
            end
            if a:byte(i) ~= b:byte(k) then
                trans_count = trans_count + 1
            end
            k = k + 1
        end
    end
    -- Plain division: an odd number of out-of-order matches yields a
    -- fractional transposition count.
    trans_count = trans_count / 2
    to_log("trans_count = ", trans_count)

    -- Jaro score: mean of the matched share of each string and the share
    -- of matches that are not transposed.
    local weight = (common / a_len + common / b_len
                    + (common - trans_count) / common) / 3
    to_log("weight = ", weight)

    -- winkler modification: continue to boost if strings are similar
    if winklerize and weight > 0.7 and a_len > 3 and b_len > 3 then
        -- length of the common prefix, capped at 4 characters
        local prefix = 0
        while prefix < 4 and a:byte(prefix + 1) == b:byte(prefix + 1) do
            prefix = prefix + 1
        end
        to_log("prefix = ", prefix)
        if prefix > 0 then
            weight = weight + (prefix * 0.1 * (1.0 - weight))
        end
        to_log("new weight_1 = ", weight)

        -- optionally adjust for long strings
        -- after agreeing beginning chars, at least two or more must agree and
        -- agreed characters must be > half of remaining characters
        if (long_tolerance and
            a_len > 4 and
            common > prefix + 1 and
            2 * common >= a_len + prefix) then
            weight = weight + ((1.0 - weight)
                     * ((common - prefix - 1) / (a_len + b_len - prefix * 2 + 2)))
        end
        to_log("new weight_2 = ", weight)
    end

    return weight
$BODY$ LANGUAGE plluau;
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment