Skip to content

Instantly share code, notes, and snippets.

@joewiz
Last active October 19, 2017 14:54
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save joewiz/0c9728a064c79be4f6af9209eef9516c to your computer and use it in GitHub Desktop.
Save joewiz/0c9728a064c79be4f6af9209eef9516c to your computer and use it in GitHub Desktop.
List abbreviations in a text, with XQuery 3.1
xquery version "3.1";

(:~
 : List the abbreviations found in a document, with XQuery 3.1.
 :
 : Every text node of the context document is joined into one
 : whitespace-normalized string, which is scanned for acronym-like tokens.
 : The result is an <abbreviations> element holding one <abbr> per distinct
 : normalized form, with the number of occurrences in its @count attribute.
 :)

(: Force oXygen to indent the output :)
declare namespace output = "http://www.w3.org/2010/xslt-xquery-serialization";
declare option output:method "xml";
declare option output:indent "yes";

(: Assumed context: you're running this in oXygen's XPath/XQuery Builder pane
 : with an XML document open in the main editor, so that the leading "//"
 : resolves against that document. :)
let $text := //text() => string-join(" ") => normalize-space()

(: Look for acronyms: two or more upper case letters, which may be delimited
 : by periods or slashes; allow plural and possessive forms through, e.g.,
 : COMs, USIA’s :)
let $analysis := analyze-string($text, "([A-Z][\./]?){2,}(’?s)?")
let $hits := $analysis//fn:match
return
    <abbreviations>{
        for $hit in $hits
        (: Build the grouping key in two steps.
         :
         : 1. Drop a plural or possessive suffix, so "USIA" and "USIA’s" share
         :    a key. This has to happen first: in a hit such as "CO.’s" the
         :    period only becomes the final character once the suffix is gone.
         :
         : 2. Ignore a trailing period unless the hit is a period-delimited
         :    acronym, i.e. it begins with two or more character-plus-period
         :    pairs. "USSR" and "USSR." become identical, while "U.S.S.R." and
         :    "U.S.S.R" stay different. :)
        group by $str :=
            replace($hit, "’?s$", "") !
            (
                if (matches(., '^(\w\.){2,}')) then
                    .
                else
                    replace(., '\.$', '')
            )
        (: After "group by", $hit holds every match that produced this key :)
        let $count := count($hit)
        (: Sort a key containing periods under its period-free spelling with
         : a single "." appended, e.g. "U.S.S.R." is ordered as "USSR." :)
        order by
            if (contains($str, '.')) then
                concat(replace($str, '\.', ''), '.')
            else
                $str
        return
            <abbr count="{$count}">{$str}</abbr>
    }</abbreviations>
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment