Split (or "tokenize") a string into "sentences", with XQuery. See https://gist.github.com/joewiz/5889711
xquery version "3.1"; | |
(: Use the eXist Stanford NLP package for sentence tokenization.
 : Compared to my original "naïve" approach, this approach needs about a quarter as many lines of XQuery code.
 : See https://gist.github.com/joewiz/5889711 :)
import module namespace nlp="http://exist-db.org/xquery/stanford-nlp"; | |
(:~
 : Split a text into sentences without supplying any tokenizer options.
 :
 : Convenience overload: delegates to the two-argument
 : local:tokenize-sentences with an empty options map.
 :
 : @param $text the text to split
 : @return the result of local:tokenize-sentences($text, map {}): one string per sentence
 :)
declare function local:tokenize-sentences($text as xs:string) as xs:string* {
    local:tokenize-sentences($text, map {})
};
(:~
 : Split a text into sentences using nlp:parse from the Stanford NLP package.
 :
 : The text is parsed with the "tokenize" and "ssplit" annotators unless the
 : caller overrides "annotators" in $options. For every sentence element in the
 : parse result, the span running from the first token's CharacterOffsetBegin
 : to the last token's CharacterOffsetEnd is cut out of the original $text, so
 : the original spacing and punctuation between tokens are kept, and the result
 : is whitespace-normalized.
 :
 : The offsets are treated as zero-based with an exclusive end (hence the + 1
 : when converting to substring's one-based start) and as ascending within a
 : sentence; presumably that is what the parser emits. TODO confirm against the
 : package documentation.
 : NOTE(review): if the offsets count UTF-16 code units rather than characters,
 : text containing characters outside the Basic Multilingual Plane would be cut
 : at the wrong positions. Confirm.
 :
 : @param $text the text to split
 : @param $options options passed to nlp:parse; entries here take precedence
 :                 over the default "annotators" entry
 : @return one whitespace-normalized string per sentence, in document order
 :)
declare function local:tokenize-sentences($text as xs:string, $options as map(*)) as xs:string* {
    let $default-options := map { "annotators": [ "tokenize", "ssplit" ] }
    (: $options comes first: by default map:merge keeps the first value seen for a duplicate key :)
    let $opts := map:merge(($options, $default-options))
    for $sentence in nlp:parse($text, $opts)//sentence
    let $tokens := $sentence/tokens/token
    return
        if (empty($tokens)) then
            (: nothing to slice; same result as joining zero token slices :)
            ""
        else
            (: one slice per sentence instead of one per token: the per-token
             : pieces were contiguous, so their concatenation is exactly this span :)
            let $begin := $tokens[1]/CharacterOffsetBegin cast as xs:integer
            let $end := $tokens[last()]/CharacterOffsetEnd cast as xs:integer
            return
                substring($text, $begin + 1, $end - $begin) => normalize-space()
};
(: Demo query: split a sample passage into sentences and emit them as numbered
 : <s> elements inside a <sentences> wrapper whose @count is the sentence total.
 : Sample text taken from http://history.state.gov/historicaldocuments/frus1964-68v06/d213 :)
let $source-text :=
    '154613. You should arrange to deliver following note to North Vietnamese Embassy.
If in your opinion it can be done without creating an issue, we would prefer that
you ask North Vietnamese Charge to come to your Embassy to receive note. “The U.S.
Government agrees with the statement of the Government of the DRV, in its note of
April 27, that it is necessary for Hanoi and Washington to engage in conversations
promptly. The U.S. Government notes that the DRV has now agreed that representatives
of the two countries should hold private discussions for the sole purpose of
agreeing on a location and date. The U.S. Government notes that the DRV did not
respond to its suggestion of April 23 that we meet for this limited purpose in a
‘capital not previously considered by either side.’ The U.S. Government suggested
the DRV might wish to indicate three appropriate locations suitable for this limited
purpose. The U.S. Government does not consider that the suggestion of Warsaw is
responsive or acceptable. The U.S. Government is prepared for these limited discussions
on April 30 or several days thereafter. The U.S. Government would welcome the prompt
response of the DRV to this suggestion.”'
let $sentences := local:tokenize-sentences($source-text)
return
    <sentences count="{count($sentences)}">{
        (: "at $n" supplies the 1-based position used as the sentence number :)
        for $s at $n in $sentences
        return
            <s n="{$n}">{$s}</s>
    }</sentences>
<sentences count="10">
<s n="1">154613.</s>
<s n="2">You should arrange to deliver following note to North Vietnamese Embassy.</s>
<s n="3">If in your opinion it can be done without creating an issue, we would prefer that you
ask North Vietnamese Charge to come to your Embassy to receive note.</s>
<s n="4">“The U.S. Government agrees with the statement of the Government of the DRV, in its
note of April 27, that it is necessary for Hanoi and Washington to engage in conversations
promptly.</s>
<s n="5">The U.S. Government notes that the DRV has now agreed that representatives of the two
countries should hold private discussions for the sole purpose of agreeing on a location and
date.</s>
<s n="6">The U.S. Government notes that the DRV did not respond to its suggestion of April 23
that we meet for this limited purpose in a ‘capital not previously considered by either
side.’</s>
<s n="7">The U.S. Government suggested the DRV might wish to indicate three appropriate
locations suitable for this limited purpose.</s>
<s n="8">The U.S. Government does not consider that the suggestion of Warsaw is responsive or
acceptable.</s>
<s n="9">The U.S. Government is prepared for these limited discussions on April 30 or several
days thereafter.</s>
<s n="10">The U.S. Government would welcome the prompt response of the DRV to this
suggestion.”</s>
</sentences>
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment