Split (or "tokenize") a string into "sentences", with XQuery. See https://gist.github.com/joewiz/5889711
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
xquery version "3.1"; | |
(: Use the eXist Stanford NLP package for sentence tokenization.
 : Compared to my original "naïve" approach, this approach takes a quarter of the lines of XQuery code.
 : See https://gist.github.com/joewiz/5889711 :)
import module namespace nlp="http://exist-db.org/xquery/stanford-nlp"; | |
(:~
 : Split a string into sentences using only the default options.
 :
 : Convenience overload: delegates to the two-argument
 : local:tokenize-sentences with an empty options map.
 :
 : @param $text the text to split into sentences
 : @return the sequence of sentence strings produced by the two-argument overload
 :)
declare function local:tokenize-sentences($text as xs:string) as xs:string* {
    local:tokenize-sentences($text, map {})
};
(:~
 : Split a string into sentences with the Stanford NLP package.
 :
 : The text is handed to nlp:parse; for every sentence element in the result,
 : the corresponding span of the original text is cut out (using the character
 : offsets reported on the sentence's tokens) and whitespace-normalized.
 :
 : @param $text the text to split into sentences
 : @param $options options for nlp:parse; merged with the default
 :        "annotators" entry ("tokenize" and "ssplit") before the call
 : @return one whitespace-normalized string per sentence element, in the order
 :         the sentences appear in the nlp:parse result; a sentence without
 :         any token yields the empty string
 :)
declare function local:tokenize-sentences($text as xs:string, $options as map(*)) as xs:string* {
    let $default-options := map { "annotators": [ "tokenize", "ssplit" ] }
    (: $options is listed first. Under the XQuery 3.1 default duplicate handling
       for map:merge ("use-first") a caller-supplied "annotators" entry therefore
       replaces the default one. NOTE(review): confirm on the eXist version in
       use, older releases may resolve duplicates differently. :)
    let $opts := map:merge(($options, $default-options))
    for $sentence in nlp:parse($text, $opts)//sentence
    let $tokens := $sentence/tokens/token
    return
        if (empty($tokens)) then
            (: nothing to slice; an empty string is kept for such a sentence :)
            ""
        else
            (: Slicing token by token (each token plus the whitespace up to the
               next token) and joining the pieces always reproduces the span
               from the first token's start to the last token's end, so a
               single substring per sentence is sufficient. :)
            (: CharacterOffsetBegin is treated as zero-based: adding 1 gives the
               1-based position that substring() expects. :)
            let $start-offset := ($tokens[1]/CharacterOffsetBegin cast as xs:integer) + 1
            (: CharacterOffsetEnd is used as the 1-based position of the last
               character, presumably because it is a zero-based exclusive end;
               verify against the NLP package output. :)
            let $end-offset := $tokens[last()]/CharacterOffsetEnd cast as xs:integer
            let $length := $end-offset - $start-offset + 1
            return
                substring($text, $start-offset, $length) => normalize-space()
};
(: Demo: split a sample document into sentences and wrap each one in a numbered <s> element.
 : Sample text taken from http://history.state.gov/historicaldocuments/frus1964-68v06/d213 :)
let $source-text :=
'154613. You should arrange to deliver following note to North Vietnamese Embassy.
If in your opinion it can be done without creating an issue, we would prefer that
you ask North Vietnamese Charge to come to your Embassy to receive note. “The U.S.
Government agrees with the statement of the Government of the DRV, in its note of
April 27, that it is necessary for Hanoi and Washington to engage in conversations
promptly. The U.S. Government notes that the DRV has now agreed that representatives
of the two countries should hold private discussions for the sole purpose of
agreeing on a location and date. The U.S. Government notes that the DRV did not
respond to its suggestion of April 23 that we meet for this limited purpose in a
‘capital not previously considered by either side.’ The U.S. Government suggested
the DRV might wish to indicate three appropriate locations suitable for this limited
purpose. The U.S. Government does not consider that the suggestion of Warsaw is
responsive or acceptable. The U.S. Government is prepared for these limited discussions
on April 30 or several days thereafter. The U.S. Government would welcome the prompt
response of the DRV to this suggestion.”'
(: one string per detected sentence, using the default annotators :)
let $sentences := local:tokenize-sentences($source-text)
return
    (: @count carries the number of sentences, @n the 1-based position of each sentence :)
    <sentences count="{count($sentences)}">{
        for $s at $n in $sentences
        return
            <s n="{$n}">{$s}</s>
    }</sentences>
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<sentences count="10"> | |
<s n="1">154613.</s> | |
<s n="2">You should arrange to deliver following note to North Vietnamese Embassy.</s> | |
<s n="3">If in your opinion it can be done without creating an issue, we would prefer that you | |
ask North Vietnamese Charge to come to your Embassy to receive note.</s> | |
<s n="4">“The U.S. Government agrees with the statement of the Government of the DRV, in its | |
note of April 27, that it is necessary for Hanoi and Washington to engage in conversations | |
promptly.</s> | |
<s n="5">The U.S. Government notes that the DRV has now agreed that representatives of the two | |
countries should hold private discussions for the sole purpose of agreeing on a location and | |
date.</s> | |
<s n="6">The U.S. Government notes that the DRV did not respond to its suggestion of April 23 | |
that we meet for this limited purpose in a ‘capital not previously considered by either | |
side.’</s> | |
<s n="7">The U.S. Government suggested the DRV might wish to indicate three appropriate | |
locations suitable for this limited purpose.</s> | |
<s n="8">The U.S. Government does not consider that the suggestion of Warsaw is responsive or | |
acceptable.</s> | |
<s n="9">The U.S. Government is prepared for these limited discussions on April 30 or several | |
days thereafter.</s> | |
<s n="10">The U.S. Government would welcome the prompt response of the DRV to this | |
suggestion.”</s> | |
</sentences> |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment