Skip to content

Instantly share code, notes, and snippets.

@wsalesky
Forked from joewiz/strip-diacritics.xq
Last active October 4, 2017 15:27
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save wsalesky/8cba7ee82f548308e29ea3c2f278a944 to your computer and use it in GitHub Desktop.
Save wsalesky/8cba7ee82f548308e29ea3c2f278a944 to your computer and use it in GitHub Desktop.
Strip diacritics, with XQuery
xquery version "3.1";
(:~
 : Removes combining marks (diacritics, vowel points, etc.) from a string.
 :
 : The input is first decomposed (NFD) so that precomposed characters expose
 : their marks as separate codepoints, then every codepoint in Unicode general
 : category M (Mark) is deleted, and finally the remainder is recomposed (NFC).
 :
 : The closing NFC step matters for characters whose canonical decomposition
 : contains no marks at all (e.g. Hangul syllables, which decompose into
 : conjoining jamo): without it such text would be returned in decomposed form
 : even though nothing was stripped from it.
 :
 : @param $string the text to strip
 : @return $string with all combining marks removed, in NFC
 :)
declare function local:strip-diacritics($string as xs:string) as xs:string {
    let $decomposed := normalize-unicode($string, 'NFD')
    let $without-marks := replace($decomposed, '\p{M}', '')
    return
        (: recompose whatever NFD split apart that was not a mark :)
        normalize-unicode($without-marks, 'NFC')
};
(:~
 : Builds a diagnostic report showing how a string behaves when its
 : combining marks are stripped.
 :
 : The returned <result> element contains, in order: the source string,
 : whether the source compares equal to its own NFC normalization, the
 : NFD-normalized string, the output of local:strip-diacritics() for the
 : source, and the codepoints of the source, the NFD form and the stripped
 : form.
 :
 : @param $string the text to inspect
 : @return a <result> element describing each stage
 :)
declare function local:inspect-diacritics($string as xs:string) as element() {
    let $nfd := normalize-unicode($string, 'NFD')
    let $bare := local:strip-diacritics($string)
    return
        element result {
            element source { $string },
            (: NFC is the default form of normalize-unicode(); spelled out here :)
            element source-is-nfc-normalized { $string = normalize-unicode($string, 'NFC') },
            element nfd-normalized { $nfd },
            element stripped-of-combining-diacritical-marks { $bare },
            element src-codepoints { string-to-codepoints($string) },
            element nfd-codepoints { string-to-codepoints($nfd) },
            element fin-codepoints { string-to-codepoints($bare) }
        }
};
(: Two words, each in three spellings: without marks, and with two different
   sets of marks (labelled "western" and "eastern" by the original author). :)
let $word1 := 'ܢܦܫܐ'
let $word1-west := 'ܢܰܦܫܳܐ'
let $word1-east := 'ܢܲܦܫܵܐ'
let $word2 := 'ܦܘܪܫܢܐ'
let $word2-west := 'ܦܘܽܪܫܳܢܳܐ'
let $word2-east := 'ܦܘܼܪܫܵܢܵܐ'
(: Each pair is [unmarked spelling, marked spelling]; one boolean is returned
   per pair, telling whether both strip down to the same string. :)
for $pair in (
    [$word1, $word1-east],
    [$word1, $word1-west],
    [$word2, $word2-east],
    [$word2, $word2-west]
)
return
    local:strip-diacritics($pair(1)) eq local:strip-diacritics($pair(2))
<result>
<source>çéüå</source>
<source-is-nfc-normalized>true</source-is-nfc-normalized>
<nfd-normalized>çéüå</nfd-normalized>
<stripped-of-combining-diacritical-marks>ceua</stripped-of-combining-diacritical-marks>
<src-codepoints>231 233 252 229</src-codepoints>
<nfd-codepoints>99 807 101 769 117 776 97 778</nfd-codepoints>
<fin-codepoints>99 101 117 97</fin-codepoints>
</result>
@wsalesky
Copy link
Author

wsalesky commented Oct 4, 2017

Updated to test Syriac with and without vowels and other character marks.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment