Skip to content

Instantly share code, notes, and snippets.

@timathom
Last active February 27, 2024 04:59
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save timathom/c3f2899cab3de4c4f9dff3f14c2e8490 to your computer and use it in GitHub Desktop.
Save timathom/c3f2899cab3de4c4f9dff3f14c2e8490 to your computer and use it in GitHub Desktop.
Jaro-Winkler similarity in XQuery 4.0
xquery version "4.0";
(:~
:
: Module Name: Jaro-Winkler String Similarity
: Date: February 26, 2024
: License: GPLv3
: XQuery specification: 4.0
: Dependencies: BaseX 11.0
: @author @timathom
:
: Revisions by @ChristianGruen and @joewiz
:
:)
declare namespace ec = "__entity-checker__";
declare function ec:jaro-winkler(
$string1 as xs:string+,
$string2 as xs:string+
) as xs:double {
if (normalize-space($string1) and normalize-space($string2)) then (
let $sorted := sort(($string1, $string2), (), fn{string-length()})
let $s1 := characters($sorted[1])
let $s2 := characters($sorted[2])
let $s1-len := count($s1)
let $s2-len := count($s2)
let $len := max(($s1-len, $s2-len))
(: Maximum distance for matching characters. :)
let $distance := max(($s1-len, $s2-len)) idiv 2 - 1
(: Find matching characters. :)
let $matches := ec:find-matches($s1, $s2, 1, $distance, $len, ())
let $strings := $matches[. instance of xs:string]
let $m := count($strings)
return if ($m > 0) then (
(: Get the indices of matched characters in each string. :)
let $indices := fn($s) {
for $match at $p in $strings
let $curr := index-of($s, $match)
return head(
$curr[. ge $p]
[(items-at($matches, .) instance of xs:string) or (items-at($matches, . - 1) instance of xs:string)]
) otherwise $curr
}
let $s1-indices := $indices($s1)
let $s2-indices := $indices($s2)
(: Check for characters that matched but are transposed. :)
let $t := sum(
for $_1 at $p1 in $s1[$s1-indices]
for $_2 at $p2 in $s2[$s2-indices]
where $p1 = $p2 and $_1 != $_2
return 1
) div 2
(: Jaro formula. :)
let $j := ($m div $s1-len + $m div $s2-len + ($m - $t) div $m) div 3
(: Winkler prefix. :)
let $sub-s1 := subsequence($s1, 1, 4)
let $sub-s2 := subsequence($s2, 1, 4)
let $l := while-do(1, fn($n) { $sub-s1[$n] = $sub-s2[$n] }, fn($n) { $n + 1 }) - 1
return $j + 0.1e0 * $l * (1 - $j)
) else 0
) else 0
};
declare function ec:find-matches(
$s1 as xs:string*,
$s2 as xs:string*,
$i as xs:integer,
$distance as xs:integer,
$len as xs:integer,
$matches as item()*
) as item()* {
if ($i > $len) then $matches else (
(: Set the distance window. :)
let $start := max((1, $i - $distance))
let $end := min(($len, $i + $distance))
(: Get the first match. :)
let $match := head(subsequence($s2, $start, $end)[. = $s1[$i]])
(: Recurse. :)
return ec:find-matches($s1, $s2, $i + 1, $distance, $len, ($matches, $match otherwise 0))
)
};
(: Sample strings. :)
let $strings := [
["FAREMVIEL", "FARMVILLE", 0.9189814815],
["Elon Musk", "Colon Muskratty", 0.7657407407],
["xabcdxxxxxx", " ydyaybycyyyyyy", 0.3767676768],
["Exlon", "Colon", 0.7333333333],
["MARTHA", "MARHTA", 0.9611111111],
["bukhari muhammad ismat allah ibn mahmud", "bukhari muhammad ismat allah ibn mahmud active 16th century", 0.9322033898],
["andrews duncan 1935-2011", "andrews duncan", 0.9166666667],
["DWAYNE", "DUANE", 0.8400000000],
["FLWOR Found.", "FLWOR Foundation", 0.9208333333],
["My Gym Children's Fitness Center", "My Gym. Childrens Fitness", .9420000000],
["DIXON", "DICKSONX", 0.8133333333],
["JELLYFISH", "SMELLYFISH", 0.8962962963]
]
(: Test results and compare to Levenshtein (BaseX implementation). :)
for member $string in $strings
return
(: ec:jaro-winkler($string?1, $string?2) :)
string-join(
(`Comparing ``{$string?1}`` to ``{$string?2}``.`,
(: `The Levenshtein similarity is {string:levenshtein($string?1, $string?2)}.`, :)
`The Jaro-Winkler similarity is {ec:jaro-winkler($string?1, $string?2)}.`,
`Expected {$string?3}`||"
")
, "
"
)
@timathom
Copy link
Author

timathom commented Feb 26, 2024

Fixed bug causing truncated comparisons.

@timathom
Copy link
Author

Fixed prefix variables.

@timathom
Copy link
Author

Added fix for trailing space.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment