Skip to content

Instantly share code, notes, and snippets.

Last active April 20, 2022 11:23
Show Gist options
  • Save joewiz/0a7e1f9b69169fc907fb482d28f24947 to your computer and use it in GitHub Desktop.
Save joewiz/0a7e1f9b69169fc907fb482d28f24947 to your computer and use it in GitHub Desktop.
Split (or "tokenize") a string into "sentences", with XQuery.
xquery version "3.1";
(: Use the eXist Stanford NLP package for sentence tokenization.
: Compared to my original "naïve" approach, this approach takes a quarter the number of lines of XQuery code.
: See the author's original "naïve" sentence-splitting gist for comparison. :)
import module namespace nlp="http://exist-db.org/xquery/stanford-nlp";
(:~
 : Split a string into sentences using the default annotator options.
 :
 : Convenience overload: delegates to the two-argument form with an empty
 : options map, so only that function's defaults apply.
 :
 : @param $text the text to split into sentences
 : @return one whitespace-normalized string per detected sentence
 :)
declare function local:tokenize-sentences($text as xs:string) as xs:string* {
    local:tokenize-sentences($text, map{})
};
(:~
 : Split a string into sentences using nlp:parse.
 :
 : Each sentence is rebuilt from the ORIGINAL text rather than from the token
 : values: for every token the slice of $text from the token's start up to the
 : start of the next token (or to the token's own end, for the last token of
 : the sentence) is taken, the slices are concatenated, and whitespace is
 : normalized.
 :
 : @param $text    the text to split into sentences
 : @param $options options passed to nlp:parse; merged with the default
 :                 "annotators" entry ("tokenize", "ssplit"), with $options
 :                 listed first in the merge
 : @return one whitespace-normalized string per sentence element returned by nlp:parse
 :)
declare function local:tokenize-sentences($text as xs:string, $options as map(*)) as xs:string* {
    let $default-options := map { "annotators": [ "tokenize", "ssplit" ] }
    let $opts := map:merge(($options, $default-options))
    for $sentence in nlp:parse($text, $opts)//sentence
    return
        string-join(
            for $token in $sentence/tokens/token
            (: offsets are incremented by 1 here because substring() positions are 1-based :)
            let $start-offset := $token/CharacterOffsetBegin cast as xs:integer + 1
            let $end-offset :=
                let $next-token := $token/following-sibling::token[1]
                return
                    if ($next-token) then
                        (: extend up to the character before the next token so that
                           inter-token whitespace from the source text is kept :)
                        $next-token/CharacterOffsetBegin cast as xs:integer
                    else
                        $token/CharacterOffsetEnd cast as xs:integer
            let $length := $end-offset - $start-offset + 1
            return
                substring($text, $start-offset, $length)
        ) => normalize-space()
};
(: sample text; the reference to its source was lost from this copy :)
(: Demo: split the sample text into sentences and emit each one as a numbered
   <s> element inside a <sentences> wrapper that carries the total count. :)
let $source-text :=
'154613. You should arrange to deliver following note to North Vietnamese Embassy.
If in your opinion it can be done without creating an issue, we would prefer that
you ask North Vietnamese Charge to come to your Embassy to receive note. “The U.S.
Government agrees with the statement of the Government of the DRV, in its note of
April 27, that it is necessary for Hanoi and Washington to engage in conversations
promptly. The U.S. Government notes that the DRV has now agreed that representatives
of the two countries should hold private discussions for the sole purpose of
agreeing on a location and date. The U.S. Government notes that the DRV did not
respond to its suggestion of April 23 that we meet for this limited purpose in a
‘capital not previously considered by either side.’ The U.S. Government suggested
the DRV might wish to indicate three appropriate locations suitable for this limited
purpose. The U.S. Government does not consider that the suggestion of Warsaw is
responsive or acceptable. The U.S. Government is prepared for these limited discussions
on April 30 or several days thereafter. The U.S. Government would welcome the prompt
response of the DRV to this suggestion.”'
let $sentences := local:tokenize-sentences($source-text)
return
    <sentences count="{count($sentences)}">{
        for $s at $n in $sentences
        return
            <s n="{$n}">{$s}</s>
    }</sentences>
<sentences count="10">
<s n="1">154613.</s>
<s n="2">You should arrange to deliver following note to North Vietnamese Embassy.</s>
<s n="3">If in your opinion it can be done without creating an issue, we would prefer that you
ask North Vietnamese Charge to come to your Embassy to receive note.</s>
<s n="4">“The U.S. Government agrees with the statement of the Government of the DRV, in its
note of April 27, that it is necessary for Hanoi and Washington to engage in conversations
promptly.</s>
<s n="5">The U.S. Government notes that the DRV has now agreed that representatives of the two
countries should hold private discussions for the sole purpose of agreeing on a location and
date.</s>
<s n="6">The U.S. Government notes that the DRV did not respond to its suggestion of April 23
that we meet for this limited purpose in a ‘capital not previously considered by either
side.’</s>
<s n="7">The U.S. Government suggested the DRV might wish to indicate three appropriate
locations suitable for this limited purpose.</s>
<s n="8">The U.S. Government does not consider that the suggestion of Warsaw is responsive or
acceptable.</s>
<s n="9">The U.S. Government is prepared for these limited discussions on April 30 or several
days thereafter.</s>
<s n="10">The U.S. Government would welcome the prompt response of the DRV to this
suggestion.”</s>
</sentences>
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment