(: Split (or "tokenize") a string into "sentences" with XQuery. See http://joewiz.org/2013/06/29/one-paragraph-many-sentences/. :)
xquery version "1.0";
(: A naive approach to sentence tokenization inspired by http://stackoverflow.com/a/2103653/659732
 :
 : Works well with edited text like newspapers. Parameters like punctuation can/should be edited;
 : see the section below called "criteria".
 :
 : For a more sophisticated approach, see Tibor Kiss and Jan Strunk, "Unsupervised Multilingual
 : Sentence Boundary Detection", Computational Linguistics, Volume 32, Issue 4, December 2006,
 : pp. 485-525. Also, see these discussions of sentence tokenization:
 : - http://nltk.org/book/ch06.html#sec-further-examples-of-supervised-classification
 : - http://www.robincamille.com/2012-02-18-nltk-sentence-tokenizer/
 :)
(: Splits text into a sequence of sentences.
 :
 : $string  the text to split; when several strings are passed they are joined
 :          with a single space and treated as one text
 : returns  one whitespace-normalized string per sentence, in document order;
 :          the empty sequence when the input contains no words
 :
 : The text is broken into whitespace-delimited words, local:get-first-sentence()
 : peels the first sentence off the front, and the function recurses on whatever
 : words are left.
 :)
declare function local:tokenize-sentences($string as xs:string*) as xs:string*
{
    (: join first so that more than one input item does not make tokenize() raise a type error :)
    let $words := tokenize(string-join($string, ' '), '\s+')[. ne '']
    return
        if (empty($words)) then
            (: nothing to tokenize: return no sentences rather than a single empty one :)
            ()
        else
            let $first-sentence := normalize-space(local:get-first-sentence($words, ''))
            (: the sentence was assembled from whole words joined by single spaces,
               so counting its space-separated tokens tells us how many words it consumed :)
            let $word-count-of-sentence := count(tokenize($first-sentence, ' '))
            let $remaining-words := subsequence($words, $word-count-of-sentence + 1)
            return
                (
                    $first-sentence,
                    if (exists($remaining-words)) then
                        local:tokenize-sentences($remaining-words)
                    else
                        ()
                )
};
(: Accumulates words from the front of $words until a word that ends the sentence is found.
 :
 : $words     the remaining words of the text, in order, without whitespace inside a word
 : $sentence  the sentence built so far (pass '' on the first call)
 : returns    $sentence followed by every consumed word, each preceded by a single
 :            space; the result therefore starts with a space when $sentence is ''
 :            and callers are expected to normalize it
 :
 : A word ends the sentence when it ends with final punctuation, is not a listed
 : abbreviation, is followed by a capitalized word, and does not begin a known phrase.
 : Because a missing next word counts as "not capitalized", the last word of the
 : text never ends the sentence itself; the recursion simply runs out of words.
 :)
declare function local:get-first-sentence($words as xs:string*, $sentence as xs:string) as xs:string {
    (: if there are no (more) words to check, we're done, so return whatever we have for the sentence :)
    if (empty($words)) then
        $sentence
    else
        let $word := subsequence($words, 1, 1)
        let $next := subsequence($words, 2, 1)
        let $rest := subsequence($words, 2)

        (: criteria :)
        let $final-punctuation-marks := '.?!'
        let $post-punctuation-possibilities := '’”"'')'
        let $pre-punctuation-possibilities := '‘“"''('
        let $final-punctuation-regex := concat('[', $final-punctuation-marks, '][', $post-punctuation-possibilities, ']?$')
        let $capitalized-test-regex := concat('^[', $pre-punctuation-possibilities, ']*?[A-Z]')
        let $words-with-ignorable-final-punctuation-marks := ('Mr.', 'Mrs.', 'Dr.', 'Amb.')
        let $known-phrases-with-ignorable-final-punctuation-marks := ('U.S. Government')

        (: test the word against the criteria :)
        let $word-ends-with-punctuation := matches($word, $final-punctuation-regex)
        (: matches() treats an absent $next as the zero-length string, so this is false at the end of the text :)
        let $next-word-is-capitalized := matches($next, $capitalized-test-regex)
        let $word-has-ignorable-punctuation := $word = $words-with-ignorable-final-punctuation-marks
        return
            (: The word stays inside the current sentence when any of these holds:
                 - it doesn't end with punctuation (like "the" or "Minister");
                 - it is in our list of words with allowable final punctuation (like "Mr.");
                 - the next word is not capitalized (like "A.B.M. treaty" or "'What?' he asked.").
               A separate "is a capitalized abbreviation" test is not needed: once the first
               two cases are ruled out the word is known to end with punctuation, so only the
               capitalization of the next word decides. :)
            if (not($word-ends-with-punctuation)
                or $word-has-ignorable-punctuation
                or not($next-word-is-capitalized)) then
                local:get-first-sentence($rest, concat($sentence, ' ', $word))
            else
                (: the word may begin a known phrase that could be mistaken for the end of a
                   sentence (like "U.S. Government"); try the longest phrases first :)
                let $sorted-phrases :=
                    for $phrase in $known-phrases-with-ignorable-final-punctuation-marks
                    order by string-length($phrase) descending
                    return $phrase
                let $words-as-string := string-join($words, ' ')
                let $matching-phrase := ($sorted-phrases[starts-with($words-as-string, .)])[1]
                return
                    if (exists($matching-phrase)) then
                        let $phrase-words := tokenize($matching-phrase, ' ')
                        let $phrase-length := count($phrase-words)
                        (: starts-with() only proves that the last phrase word is a PREFIX of the
                           corresponding text word ("Government." or "Governments" also match).
                           Skip over that word only when it is exactly the phrase word; otherwise
                           stop just before it so it is examined on the next call and can still
                           end the sentence. Always consume at least one word so we advance. :)
                        let $last-word-is-exact := $words[$phrase-length] eq $phrase-words[$phrase-length]
                        let $consumed :=
                            if ($last-word-is-exact) then
                                $phrase-length
                            else
                                max((1, $phrase-length - 1))
                        return
                            local:get-first-sentence(
                                subsequence($words, $consumed + 1),
                                (: append the words as they appear in the text, not the phrase literal :)
                                concat($sentence, ' ', string-join(subsequence($words, 1, $consumed), ' '))
                            )
                    (: the word ends the sentence - we're done with this sentence! :)
                    else
                        concat($sentence, ' ', $word)
};
(: sample text taken from http://history.state.gov/historicaldocuments/frus1964-68v06/d213 :)
(: Main query body: tokenizes the sample text and wraps each sentence in a numbered
   <s> element inside a <p> whose sentence-count attribute holds the total.
   The literal below must contain only the sample text itself; any stray character
   inside it becomes part of a sentence. :)
let $source-text :=
    '154613. You should arrange to deliver following note to North Vietnamese Embassy.
If in your opinion it can be done without creating an issue, we would prefer that
you ask North Vietnamese Charge to come to your Embassy to receive note. “The U.S.
Government agrees with the statement of the Government of the DRV, in its note of
April 27, that it is necessary for Hanoi and Washington to engage in conversations
promptly. The U.S. Government notes that the DRV has now agreed that representatives
of the two countries should hold private discussions for the sole purpose of
agreeing on a location and date. The U.S. Government notes that the DRV did not
respond to its suggestion of April 23 that we meet for this limited purpose in a
‘capital not previously considered by either side.’ The U.S. Government suggested
the DRV might wish to indicate three appropriate locations suitable for this limited
purpose. The U.S. Government does not consider that the suggestion of Warsaw is
responsive or acceptable. The U.S. Government is prepared for these limited discussions
on April 30 or several days thereafter. The U.S. Government would welcome the prompt
response of the DRV to this suggestion.”'
let $sentences := local:tokenize-sentences($source-text)
return
    <p sentence-count="{count($sentences)}">{
        for $sentence at $n in $sentences
        return
            <s n="{$n}">{$sentence}</s>
    }</p>
(: should return:
<p sentence-count="10">
    <s n="1">154613.</s>
    <s n="2">You should arrange to deliver following note to North Vietnamese Embassy.</s>
    <s n="3">If in your opinion it can be done without creating an issue, we would prefer that you
        ask North Vietnamese Charge to come to your Embassy to receive note.</s>
    <s n="4">“The U.S. Government agrees with the statement of the Government of the DRV, in its
        note of April 27, that it is necessary for Hanoi and Washington to engage in conversations
        promptly.</s>
    <s n="5">The U.S. Government notes that the DRV has now agreed that representatives of the two
        countries should hold private discussions for the sole purpose of agreeing on a location and
        date.</s>
    <s n="6">The U.S. Government notes that the DRV did not respond to its suggestion of April 23
        that we meet for this limited purpose in a ‘capital not previously considered by either
        side.’</s>
    <s n="7">The U.S. Government suggested the DRV might wish to indicate three appropriate
        locations suitable for this limited purpose.</s>
    <s n="8">The U.S. Government does not consider that the suggestion of Warsaw is responsive or
        acceptable.</s>
    <s n="9">The U.S. Government is prepared for these limited discussions on April 30 or several
        days thereafter.</s>
    <s n="10">The U.S. Government would welcome the prompt response of the DRV to this
        suggestion.”</s>
</p>
:)
(: Scraped GitHub page footer, not part of the query:
   "Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment" :)