Skip to content

Instantly share code, notes, and snippets.

@joewiz
Last active July 5, 2022 17:18
Show Gist options
  • Star 3 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save joewiz/b8b2f29c7aa4a08ee99a6b6a0b3311d3 to your computer and use it in GitHub Desktop.
Save joewiz/b8b2f29c7aa4a08ee99a6b6a0b3311d3 to your computer and use it in GitHub Desktop.
Calculate Levenshtein Distance, using XQuery
xquery version "3.1";
(:~
Calculate Levenshtein Distance, using XQuery
@author Guillaume Mella
@see http://apps.jmmc.fr/~mellag/xquery/levenshtein/2018/01/19/xquery-levenshtein-distance/
:)
declare function local:levenshtein-distance($string1 as xs:string?, $string2 as xs:string?)
as xs:integer
{
if ( fn:min( (fn:string-length($string1), fn:string-length($string2)) ) = 0 )
then
fn:max((fn:string-length($string1), fn:string-length($string2)))
else
local:_levenshtein-distance(
fn:string-to-codepoints($string1),
fn:string-to-codepoints($string2),
fn:string-length($string1),
fn:string-length($string2),
(1, 0, 1),
2
)
};
declare function local:_levenshtein-distance(
$chars1 as xs:integer*,
$chars2 as xs:integer*,
$length1 as xs:integer,
$length2 as xs:integer,
$lastDiag as xs:integer*,
$total as xs:integer)
as xs:integer
{
let $shift := if ($total > $length2) then ($total - ($length2 + 1)) else 0
let $diag :=
for $i in (fn:max((0, $total - $length2)) to fn:min(($total, $length1)))
let $j := $total - $i let $d := ($i - $shift) * 2
return (
if ($j lt $length2) then
$lastDiag[$d - 1]
else () ,
if ($i = 0) then $j
else if ($j = 0) then $i
else fn:min(($lastDiag[$d - 1] + 1,
$lastDiag[$d + 1] + 1,
$lastDiag[$d] + (if ($chars1[$i] = $chars2[$j]) then 0 else 1)
))
)
return
if ($total = $length1 + $length2) then fn:exactly-one($diag)
else local:_levenshtein-distance($chars1, $chars2, $length1, $length2, $diag, $total + 1)
};
let $input := (
" 2 Etablissement public pour l'aménagement de la région de La Défense"
," 3 Etablissement public pour l'aménagement de la région dite “de la Défense”"
," 29 Etablissement public pour l'aménagement de la région dite « de la Défense »"
," 1 Etablissement public pour l'aménagement de la région dite “de La Défense”"
," 2 groupement de coopération sanitaire « Centre régional de compétence en surdité infantile »"
," 1 groupement de coopération sanitaire « Centre régional de compétence en surdité infantile » (CRCSI)"
)
let $input := for $e in $input return substring($e,5)
return
<div>
<ol>{for $l in $input return <li>{$l}</li>}</ol>
<table>
<tr><td/>{
for $s at $p in $input return <td>{$p}</td>
}</tr>
{for $s1 at $p1 in $input
return <tr>
<td>{$p1}</td>
{
for $s2 at $p2 in $input
return if ($p1>$p2) then <td></td> else <td>{local:levenshtein-distance($s1, $s2)}</td>
}
</tr>
}</table>
</div>
xquery version "3.1";
(:~
Calculate Levenshtein Distance, using XQuery
Adapted from Guillaume Mella's source by Joe Wicentowski - reformatted, refactored, refocused
@author Guillaume Mella (original)
@author Joe Wicentowski (revisions)
@see http://apps.jmmc.fr/~mellag/xquery/levenshtein/2018/01/19/xquery-levenshtein-distance/
:)
declare function local:levenshtein-distance(
$string1 as xs:string?,
$string2 as xs:string?)
as xs:integer {
if (min((string-length($string1), string-length($string2))) eq 0) then
max((string-length($string1), string-length($string2)))
else
local:_levenshtein-distance(
string-to-codepoints($string1),
string-to-codepoints($string2),
string-length($string1),
string-length($string2),
(1, 0, 1),
2
)
};
declare %private function local:_levenshtein-distance(
$chars1 as xs:integer*,
$chars2 as xs:integer*,
$length1 as xs:integer,
$length2 as xs:integer,
$lastDiag as xs:integer*,
$total as xs:integer)
as xs:integer {
let $shift :=
if ($total gt $length2) then
($total - ($length2 + 1))
else
0
let $diag :=
for $i in (max((0, $total - $length2)) to min(($total, $length1)))
let $j := $total - $i
let $d := ($i - $shift) * 2
return (
if ($j lt $length2) then
$lastDiag[$d - 1]
else
(),
if ($i eq 0) then
$j
else if ($j eq 0) then
$i
else
min((
$lastDiag[$d - 1] + 1,
$lastDiag[$d + 1] + 1,
$lastDiag[$d] + (
if ($chars1[$i] eq $chars2[$j]) then
0
else
1
)
))
)
return
if ($total eq $length1 + $length2) then
$diag
else
local:_levenshtein-distance($chars1, $chars2, $length1, $length2, $diag, $total + 1)
};
local:levenshtein-distance("gumbo", "gambol")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment