Skip to content

Instantly share code, notes, and snippets.

@joewiz
Last active April 20, 2022 11:23
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save joewiz/0a7e1f9b69169fc907fb482d28f24947 to your computer and use it in GitHub Desktop.
Save joewiz/0a7e1f9b69169fc907fb482d28f24947 to your computer and use it in GitHub Desktop.
Split (or "tokenize") a string into "sentences", with XQuery. See https://gist.github.com/joewiz/5889711
xquery version "3.1";
(: Use the eXist Stanford NLP package for sentence tokenization.
: Compared to my original "naïve" approach, this approach takes a quarter the number of lines of XQuery code.
: See https://gist.github.com/joewiz/5889711 :)
import module namespace nlp="http://exist-db.org/xquery/stanford-nlp";
(:~
 : Convenience overload: splits $text into sentences without any caller-supplied
 : options, by delegating to the two-argument local:tokenize-sentences with an
 : empty options map.
 :
 : @param $text the text to split into sentences
 : @return whatever the two-argument form returns for $text and an empty map
 :)
declare function local:tokenize-sentences($text as xs:string) {
    let $no-options := map {}
    return
        local:tokenize-sentences($text, $no-options)
};
(:~
 : Splits $text into sentences, using the sentence and token boundaries found in
 : the result of nlp:parse.
 :
 : For every `sentence` element in the parse result, the function reads the
 : CharacterOffsetBegin of the first token and the CharacterOffsetEnd of the last
 : token, cuts that span out of the ORIGINAL text (so the original spacing and
 : punctuation are kept rather than re-assembled from tokens), and then collapses
 : runs of whitespace (including line breaks) with normalize-space().
 :
 : The original implementation cut one substring per token (token start up to the
 : start of the next token) and joined them; those pieces tile exactly the span
 : first-token-begin .. last-token-end, so a single substring per sentence yields
 : the same string with less work.
 :
 : @param $text    the text to split into sentences
 : @param $options options passed on to nlp:parse, merged with the default
 :                 "annotators" entry ("tokenize", "ssplit")
 : @return one whitespace-normalized string per sentence, in the order the
 :         sentences appear in the parse result
 :)
declare function local:tokenize-sentences($text as xs:string, $options as map(*)) as xs:string* {
    let $default-options := map { "annotators": [ "tokenize", "ssplit" ] }
    (: $options is listed first so that caller-supplied entries are meant to win over
       the defaults; this relies on map:merge keeping the first value for a duplicate
       key (the default in XPath Functions 3.1) - TODO confirm for the eXist version in use :)
    let $opts := map:merge(($options, $default-options))
    for $sentence in nlp:parse($text, $opts)//sentence
    (: assumes the tokens of a sentence are contiguous siblings in text order :)
    let $tokens := $sentence/tokens/token
    return
        if (empty($tokens)) then
            (: a sentence without tokens contributes an empty string, as before :)
            ""
        else
            (: offsets are used as 0-based begin / exclusive end, hence "+ 1" to reach
               the 1-based position expected by substring().
               NOTE(review): the offsets presumably count UTF-16 code units while
               substring() counts characters - confirm for text containing
               supplementary-plane characters :)
            let $begin := $tokens[1]/CharacterOffsetBegin cast as xs:integer
            let $end := $tokens[last()]/CharacterOffsetEnd cast as xs:integer
            return
                substring($text, $begin + 1, $end - $begin) => normalize-space()
};
(: Demo: run the sentence splitter over a sample passage and wrap each sentence
   in a numbered <s> element inside a <sentences> element carrying the total count.
   Sample text taken from http://history.state.gov/historicaldocuments/frus1964-68v06/d213 :)
let $sample :=
'154613. You should arrange to deliver following note to North Vietnamese Embassy.
If in your opinion it can be done without creating an issue, we would prefer that
you ask North Vietnamese Charge to come to your Embassy to receive note. “The U.S.
Government agrees with the statement of the Government of the DRV, in its note of
April 27, that it is necessary for Hanoi and Washington to engage in conversations
promptly. The U.S. Government notes that the DRV has now agreed that representatives
of the two countries should hold private discussions for the sole purpose of
agreeing on a location and date. The U.S. Government notes that the DRV did not
respond to its suggestion of April 23 that we meet for this limited purpose in a
‘capital not previously considered by either side.’ The U.S. Government suggested
the DRV might wish to indicate three appropriate locations suitable for this limited
purpose. The U.S. Government does not consider that the suggestion of Warsaw is
responsive or acceptable. The U.S. Government is prepared for these limited discussions
on April 30 or several days thereafter. The U.S. Government would welcome the prompt
response of the DRV to this suggestion.”'
let $found := local:tokenize-sentences($sample)
let $total := count($found)
return
    element sentences {
        attribute count { $total },
        (: number the sentences by their 1-based position in the result :)
        for $idx in 1 to $total
        return
            element s {
                attribute n { $idx },
                $found[$idx]
            }
    }
<sentences count="10">
<s n="1">154613.</s>
<s n="2">You should arrange to deliver following note to North Vietnamese Embassy.</s>
<s n="3">If in your opinion it can be done without creating an issue, we would prefer that you
ask North Vietnamese Charge to come to your Embassy to receive note.</s>
<s n="4">“The U.S. Government agrees with the statement of the Government of the DRV, in its
note of April 27, that it is necessary for Hanoi and Washington to engage in conversations
promptly.</s>
<s n="5">The U.S. Government notes that the DRV has now agreed that representatives of the two
countries should hold private discussions for the sole purpose of agreeing on a location and
date.</s>
<s n="6">The U.S. Government notes that the DRV did not respond to its suggestion of April 23
that we meet for this limited purpose in a ‘capital not previously considered by either
side.’</s>
<s n="7">The U.S. Government suggested the DRV might wish to indicate three appropriate
locations suitable for this limited purpose.</s>
<s n="8">The U.S. Government does not consider that the suggestion of Warsaw is responsive or
acceptable.</s>
<s n="9">The U.S. Government is prepared for these limited discussions on April 30 or several
days thereafter.</s>
<s n="10">The U.S. Government would welcome the prompt response of the DRV to this
suggestion.”</s>
</sentences>
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment