Last active
October 19, 2017 14:54
-
-
Save joewiz/0c9728a064c79be4f6af9209eef9516c to your computer and use it in GitHub Desktop.
List abbreviations in a text, with XQuery 3.1
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
xquery version "3.1";

(: List the abbreviations (acronyms) found in a text, with the number of
 : times each one occurs.
 :
 : Assumed context: you're running this in oXygen's XPath/XQuery Builder pane
 : with an XML document open in the main editor; that document supplies the
 : context for the //text() path below.
 :
 : Output: an <abbreviations> element containing one <abbr count="N"> per
 : distinct abbreviation. :)

(: Force oXygen to indent the output :)
declare namespace output = "http://www.w3.org/2010/xslt-xquery-serialization";
declare option output:method "xml";
declare option output:indent "yes";

(: Flatten every text node of the document into a single
 : whitespace-normalized string :)
let $text := //text() => string-join(" ") => normalize-space()

(: Look for acronyms: two or more upper case letters, which may be delimited
 : by periods or slashes; allow plural and possessive forms through, e.g.,
 : COMs, USIA’s :)
let $analysis := analyze-string($text, "([A-Z][\./]?){2,}(’?s)?")
let $hits := $analysis//fn:match
return
    <abbreviations>{
        for $hit in $hits
        (: For grouping purposes, ignore a trailing delimiter if it's not a
         : period-delimited acronym, e.g., treat "USSR" and "USSR." as
         : identical but treat "U.S.S.R." and "U.S.S.R" as different.
         : The match pattern also lets a slash through after the last letter
         : (e.g., "FBI/" in "FBI/field"), so a trailing slash is dropped as
         : well; otherwise "FBI/" would be counted apart from "FBI".
         : Similarly, treat "USIA", "USIA’s" as the same :)
        group by $str :=
            (
                if (matches($hit, '^(\w\.){2,}')) then
                    $hit
                else
                    replace($hit, '[./]$', '')
            ) !
            replace(., "’?s$", "")
        (: After "group by", $hit is bound to every match in the group :)
        let $count := count($hit)
        (: Sort period-delimited forms by their undotted spelling plus one
         : final period, e.g., "U.S." is ordered using the key "US." :)
        order by
            if (contains($str, '.')) then
                concat(replace($str, '\.', ''), '.')
            else
                $str
        return
            <abbr count="{$count}">{$str}</abbr>
    }</abbreviations>
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment