Instantly share code, notes, and snippets.

# joewiz/01-levenshtein-distance.xq Last active Jun 4, 2019

Calculate Levenshtein Distance, using XQuery
 xquery version "3.1"; (:~ Calculate Levenshtein Distance, using XQuery @author Guillaume Mella @see http://apps.jmmc.fr/~mellag//xquery/levenshtein/2018/01/19/xquery-levenshtein-distance/ :) declare function local:levenshtein-distance(\$string1 as xs:string?, \$string2 as xs:string?) as xs:integer { if ( fn:min( (fn:string-length(\$string1), fn:string-length(\$string2)) ) = 0 ) then fn:max((fn:string-length(\$string1), fn:string-length(\$string2))) else local:_levenshtein-distance( fn:string-to-codepoints(\$string1), fn:string-to-codepoints(\$string2), fn:string-length(\$string1), fn:string-length(\$string2), (1, 0, 1), 2 ) }; declare function local:_levenshtein-distance( \$chars1 as xs:integer*, \$chars2 as xs:integer*, \$length1 as xs:integer, \$length2 as xs:integer, \$lastDiag as xs:integer*, \$total as xs:integer) as xs:integer { let \$shift := if (\$total > \$length2) then (\$total - (\$length2 + 1)) else 0 let \$diag := for \$i in (fn:max((0, \$total - \$length2)) to fn:min((\$total, \$length1))) let \$j := \$total - \$i let \$d := (\$i - \$shift) * 2 return ( if (\$j lt \$length2) then \$lastDiag[\$d - 1] else () , if (\$i = 0) then \$j else if (\$j = 0) then \$i else fn:min((\$lastDiag[\$d - 1] + 1, \$lastDiag[\$d + 1] + 1, \$lastDiag[\$d] + (if (\$chars1[\$i] = \$chars2[\$j]) then 0 else 1) )) ) return if (\$total = \$length1 + \$length2) then fn:exactly-one(\$diag) else local:_levenshtein-distance(\$chars1, \$chars2, \$length1, \$length2, \$diag, \$total + 1) }; let \$input := ( " 2 Etablissement public pour l'aménagement de la région de La Défense" ," 3 Etablissement public pour l'aménagement de la région dite “de la Défense”" ," 29 Etablissement public pour l'aménagement de la région dite « de la Défense »" ," 1 Etablissement public pour l'aménagement de la région dite “de La Défense”" ," 2 groupement de coopération sanitaire « Centre régional de compétence en surdité infantile »" ," 1 groupement de coopération sanitaire « Centre régional de compétence en surdité infantile » (CRCSI)" ) let \$input := for \$e in \$input return substring(\$e,5) return
{for \$l in \$input return
1. {\$l}
2. }
{ for \$s at \$p in \$input return {\$p}
{\$p1}{local:levenshtein-distance(\$s1, \$s2)}
} {for \$s1 at \$p1 in \$input return { for \$s2 at \$p2 in \$input return if (\$p1>\$p2) then else } }
 xquery version "3.1"; (:~ Calculate Levenshtein Distance, using XQuery Adapted from Guillaume Mella's source by Joe Wicentowski - reformatted, refactored, refocused @author Guillaume Mella (original) @author Joe Wicentowski (revisions) @see http://apps.jmmc.fr/~mellag//xquery/levenshtein/2018/01/19/xquery-levenshtein-distance/ :) declare function local:levenshtein-distance( \$string1 as xs:string?, \$string2 as xs:string?) as xs:integer { if (min((string-length(\$string1), string-length(\$string2))) eq 0) then max((string-length(\$string1), string-length(\$string2))) else local:_levenshtein-distance( string-to-codepoints(\$string1), string-to-codepoints(\$string2), string-length(\$string1), string-length(\$string2), (1, 0, 1), 2 ) }; declare %private function local:_levenshtein-distance( \$chars1 as xs:integer*, \$chars2 as xs:integer*, \$length1 as xs:integer, \$length2 as xs:integer, \$lastDiag as xs:integer*, \$total as xs:integer) as xs:integer { let \$shift := if (\$total gt \$length2) then (\$total - (\$length2 + 1)) else 0 let \$diag := for \$i in (max((0, \$total - \$length2)) to min((\$total, \$length1))) let \$j := \$total - \$i let \$d := (\$i - \$shift) * 2 return ( if (\$j lt \$length2) then \$lastDiag[\$d - 1] else (), if (\$i eq 0) then \$j else if (\$j eq 0) then \$i else min(( \$lastDiag[\$d - 1] + 1, \$lastDiag[\$d + 1] + 1, \$lastDiag[\$d] + ( if (\$chars1[\$i] eq \$chars2[\$j]) then 0 else 1 ) )) ) return if (\$total eq \$length1 + \$length2) then \$diag else local:_levenshtein-distance(\$chars1, \$chars2, \$length1, \$length2, \$diag, \$total + 1) }; local:levenshtein-distance("gumbo", "gambol")