joewiz/tokenize-sentences.xq

## tokenize-sentences.xq
xquery version "1.0";

(: A naive approach to sentence tokenization inspired by http://stackoverflow.com/a/2103653/659732
 :
 : Works well with edited text like newspapers. Parameters like punctuation can/should be edited;
 : see the section below called "criteria".
 :
 : For a more sophisticated approach, see Tibor Kiss and Jan Strunk, "Unsupervised Multilingual
 : Sentence Boundary Detection", Computational Linguistics, Volume 32, Issue 4, December 2006,
 : pp. 485-525.  Also, see these discussions of sentence tokenization:
 : - http://nltk.org/book/ch06.html#sec-further-examples-of-supervised-classification
 : - http://www.robincamille.com/2012-02-18-nltk-sentence-tokenizer/
 :)

(: Splits text into sentences.
 :
 : $string - the text to split. Several strings may be supplied; they are
 :           joined with single spaces before splitting, because fn:tokenize
 :           accepts at most one string.
 :
 : Returns one whitespace-normalized string per detected sentence, in reading
 : order, or the empty sequence when the input contains no words.
 :)
declare function local:tokenize-sentences($string as xs:string*) as xs:string*
{
    (: leading/trailing whitespace makes tokenize() emit '' tokens; drop them :)
    let $words := tokenize(string-join($string, ' '), '\s+')[. ne '']
    return
        (: no words means no sentences (rather than a single empty sentence) :)
        if (empty($words)) then
            ()
        else
            let $first-sentence := normalize-space(local:get-first-sentence($words, ''))
            (: every consumed word appears in the sentence separated by one space,
               so counting tokens tells us how many words to skip :)
            let $word-count-of-sentence := count(tokenize($first-sentence, ' '))
            return
                ($first-sentence,
                if (count($words) gt $word-count-of-sentence) then
                    local:tokenize-sentences(subsequence($words, $word-count-of-sentence + 1))
                else
                    ()
                )
};

(: Builds up the first sentence found in $words, one word at a time.
 :
 : $words    - the words not yet examined, in reading order
 : $sentence - the sentence text accumulated so far (pass '' to start)
 :
 : Returns $sentence with every word up to and including the sentence-ending
 : word appended. Each appended word is preceded by a single space, so the
 : result begins with a space when $sentence is ''.
 :)
declare function local:get-first-sentence($words as xs:string*, $sentence as xs:string) as xs:string {

    (: if there are no (more) words to check, we're done, so return whatever we have for the sentence :)
    if (empty($words)) then
        $sentence

    (: begin analyzing the word :)
    else
        let $word := subsequence($words, 1, 1)
        let $next := subsequence($words, 2, 1)
        let $rest := subsequence($words, 2)

        (: criteria :)
        let $final-punctuation-marks := '.?!'
        let $post-punctuation-possibilities := '’”"'')'
        let $pre-punctuation-possibilities := '‘“"''('
        let $final-punctuation-regex := concat('[', $final-punctuation-marks, '][', $post-punctuation-possibilities, ']?$')
        let $capitalized-test-regex := concat('^[', $pre-punctuation-possibilities, ']*?[A-Z]')
        let $words-with-ignorable-final-punctuation-marks := ('Mr.', 'Mrs.', 'Dr.', 'Amb.')
        let $known-phrases-with-ignorable-final-punctuation-marks := ('U.S. Government')

        (: test the word against the criteria :)
        let $word-ends-with-punctuation := matches($word, $final-punctuation-regex)
        (: when there is no next word, matches() treats the empty sequence as '' and yields false :)
        let $next-word-is-capitalized := matches($next, $capitalized-test-regex)
        let $word-has-ignorable-punctuation := $word = $words-with-ignorable-final-punctuation-marks

        return

            (: if word doesn't end with punctuation (like "the" or "Minister"),
               then consider it part of the existing sentence and move to the next word. :)
            if (not($word-ends-with-punctuation)) then
                local:get-first-sentence(
                    $rest,
                    concat($sentence, ' ', $word)
                    )

            (: if the word is in our list of words with allowable final punctuation (like "Mr."),
               then consider it part of the existing sentence and move to the next word. :)
            else if ($word-has-ignorable-punctuation) then
                local:get-first-sentence(
                    $rest,
                    concat($sentence, ' ', $word)
                    )

            (: the word ends with punctuation (guaranteed by the first branch) but the next word
               is not capitalized (like "A.B.M. treaty" or "'What?' he asked."),
               so consider it part of the existing sentence and move to the next word. :)
            else if (not($next-word-is-capitalized)) then
                local:get-first-sentence(
                    $rest,
                    concat($sentence, ' ', $word)
                    )

            (: if the word starts a known phrase that could be mistaken for the end of a sentence
               (like "U.S. Government"), then keep going past the phrase. :)
            else
                let $sorted-phrases :=
                    (: order by character length, longest to shortest, so the longest phrase wins :)
                    for $phrase in $known-phrases-with-ignorable-final-punctuation-marks
                    order by string-length($phrase) descending
                    return $phrase
                (: number of leading words to accept as part of the sentence, or () if no phrase matches :)
                let $consumed-word-count :=
                    (
                        for $phrase in $sorted-phrases
                        let $phrase-length := count(tokenize($phrase, ' '))
                        let $candidate := string-join(subsequence($words, 1, $phrase-length), ' ')
                        (: an empty phrase would consume nothing and recurse forever :)
                        where $phrase-length ge 1
                        return
                            if (not(starts-with($candidate, $phrase))) then
                                ()
                            else
                                (: whatever follows the phrase inside its last word, e.g. "." in "Government." :)
                                let $tail := substring($candidate, string-length($phrase) + 1)
                                return
                                    (: exact match on word boundaries: accept the whole phrase :)
                                    if ($tail eq '') then
                                        $phrase-length
                                    (: last word carries extra punctuation: accept the words before it and
                                       let the last word be judged by the normal rules :)
                                    else if ($phrase-length gt 1 and matches($tail, '^[^\p{L}\p{N}]+$')) then
                                        $phrase-length - 1
                                    (: the last word merely begins with the phrase's last word
                                       (like "Governmental"): not a match :)
                                    else
                                        ()
                    )[1]
                return
                    if (exists($consumed-word-count)) then
                        local:get-first-sentence(
                            subsequence($words, $consumed-word-count + 1),
                            (: append the words as they appear in the text, not the phrase literal :)
                            concat($sentence, ' ', string-join(subsequence($words, 1, $consumed-word-count), ' '))
                            )

                    (: the word ends the sentence - we're done with this sentence! :)
                    else
                        concat($sentence, ' ', $word)
};

(: sample text taken from http://history.state.gov/historicaldocuments/frus1964-68v06/d213 :)

(: Demo: split a sample diplomatic note into sentences and report each one,
   numbered from 1, inside a <p> element that also carries the total count. :)
let $sample-note :=
    '154613. You should arrange to deliver following note to North Vietnamese Embassy.
    If in your opinion it can be done without creating an issue, we would prefer that
    you ask North Vietnamese Charge to come to your Embassy to receive note. “The U.S.
    Government agrees with the statement of the Government of the DRV, in its note of
    April 27, that it is necessary for Hanoi and Washington to engage in conversations
    promptly. The U.S. Government notes that the DRV has now agreed that representatives
    of the two countries should hold private discussions for the sole purpose of
    agreeing on a location and date. The U.S. Government notes that the DRV did not
    respond to its suggestion of April 23 that we meet for this limited purpose in a
    ‘capital not previously considered by either side.’ The U.S. Government suggested
    the DRV might wish to indicate three appropriate locations suitable for this limited
    purpose. The U.S. Government does not consider that the suggestion of Warsaw is
    responsive or acceptable. The U.S. Government is prepared for these limited discussions
    on April 30 or several days thereafter. The U.S. Government would welcome the prompt
    response of the DRV to this suggestion.”'
let $detected := local:tokenize-sentences($sample-note)
(: one <s> per sentence, with its 1-based position in @n :)
let $numbered :=
    for $text at $position in $detected
    return
        element s {
            attribute n { $position },
            $text
        }
return
    element p {
        attribute sentence-count { count($detected) },
        $numbered
    }

(: should return:

    <p sentence-count="10">
        <s n="1">154613.</s>
        <s n="2">You should arrange to deliver following note to North Vietnamese Embassy.</s>
        <s n="3">If in your opinion it can be done without creating an issue, we would prefer that you
            ask North Vietnamese Charge to come to your Embassy to receive note.</s>
        <s n="4">“The U.S. Government agrees with the statement of the Government of the DRV, in its
            note of April 27, that it is necessary for Hanoi and Washington to engage in conversations
            promptly.</s>
        <s n="5">The U.S. Government notes that the DRV has now agreed that representatives of the two
            countries should hold private discussions for the sole purpose of agreeing on a location and
            date.</s>
        <s n="6">The U.S. Government notes that the DRV did not respond to its suggestion of April 23
            that we meet for this limited purpose in a ‘capital not previously considered by either
            side.’</s>
        <s n="7">The U.S. Government suggested the DRV might wish to indicate three appropriate
            locations suitable for this limited purpose.</s>
        <s n="8">The U.S. Government does not consider that the suggestion of Warsaw is responsive or
            acceptable.</s>
        <s n="9">The U.S. Government is prepared for these limited discussions on April 30 or several
            days thereafter.</s>
        <s n="10">The U.S. Government would welcome the prompt response of the DRV to this
            suggestion.”</s>
    </p>
:)
	xquery version "1.0";

	(: A naive approach to sentence tokenization inspired by http://stackoverflow.com/a/2103653/659732
	:
	: Works well with edited text like newspapers. Parameters like punctuation can/should be edited;
	: see the section below called "criteria".
	:
	: For a more sophisticated approach, see Tibor Kiss and Jan Strunk, "Unsupervised Multilingual
	: Sentence Boundary Detection", Computational Linguistics, Volume 32, Issue 4, December 2006,
	: pp. 485-525. Also, see these discussions of sentence tokenization:
	: - http://nltk.org/book/ch06.html#sec-further-examples-of-supervised-classification
	: - http://www.robincamille.com/2012-02-18-nltk-sentence-tokenizer/
	:)

	(: Splits text into sentences.
	 :
	 : $string - the text to split. Several strings may be supplied; they are
	 :           joined with single spaces before splitting, because fn:tokenize
	 :           accepts at most one string.
	 :
	 : Returns one whitespace-normalized string per detected sentence, in reading
	 : order, or the empty sequence when the input contains no words.
	 :)
	declare function local:tokenize-sentences($string as xs:string*) as xs:string*
	{
	    (: leading/trailing whitespace makes tokenize() emit '' tokens; drop them :)
	    let $words := tokenize(string-join($string, ' '), '\s+')[. ne '']
	    return
	        (: no words means no sentences (rather than a single empty sentence) :)
	        if (empty($words)) then
	            ()
	        else
	            let $first-sentence := normalize-space(local:get-first-sentence($words, ''))
	            (: every consumed word appears in the sentence separated by one space,
	               so counting tokens tells us how many words to skip :)
	            let $word-count-of-sentence := count(tokenize($first-sentence, ' '))
	            return
	                ($first-sentence,
	                if (count($words) gt $word-count-of-sentence) then
	                    local:tokenize-sentences(subsequence($words, $word-count-of-sentence + 1))
	                else
	                    ()
	                )
	};

	(: Builds up the first sentence found in $words, one word at a time.
	 :
	 : $words    - the words not yet examined, in reading order
	 : $sentence - the sentence text accumulated so far (pass '' to start)
	 :
	 : Returns $sentence with every word up to and including the sentence-ending
	 : word appended. Each appended word is preceded by a single space, so the
	 : result begins with a space when $sentence is ''.
	 :)
	declare function local:get-first-sentence($words as xs:string*, $sentence as xs:string) as xs:string {

	    (: if there are no (more) words to check, we're done, so return whatever we have for the sentence :)
	    if (empty($words)) then
	        $sentence

	    (: begin analyzing the word :)
	    else
	        let $word := subsequence($words, 1, 1)
	        let $next := subsequence($words, 2, 1)
	        let $rest := subsequence($words, 2)

	        (: criteria :)
	        let $final-punctuation-marks := '.?!'
	        let $post-punctuation-possibilities := '’”"'')'
	        let $pre-punctuation-possibilities := '‘“"''('
	        let $final-punctuation-regex := concat('[', $final-punctuation-marks, '][', $post-punctuation-possibilities, ']?$')
	        let $capitalized-test-regex := concat('^[', $pre-punctuation-possibilities, ']*?[A-Z]')
	        let $words-with-ignorable-final-punctuation-marks := ('Mr.', 'Mrs.', 'Dr.', 'Amb.')
	        let $known-phrases-with-ignorable-final-punctuation-marks := ('U.S. Government')

	        (: test the word against the criteria :)
	        let $word-ends-with-punctuation := matches($word, $final-punctuation-regex)
	        (: when there is no next word, matches() treats the empty sequence as '' and yields false :)
	        let $next-word-is-capitalized := matches($next, $capitalized-test-regex)
	        let $word-has-ignorable-punctuation := $word = $words-with-ignorable-final-punctuation-marks

	        return

	            (: if word doesn't end with punctuation (like "the" or "Minister"),
	               then consider it part of the existing sentence and move to the next word. :)
	            if (not($word-ends-with-punctuation)) then
	                local:get-first-sentence(
	                    $rest,
	                    concat($sentence, ' ', $word)
	                    )

	            (: if the word is in our list of words with allowable final punctuation (like "Mr."),
	               then consider it part of the existing sentence and move to the next word. :)
	            else if ($word-has-ignorable-punctuation) then
	                local:get-first-sentence(
	                    $rest,
	                    concat($sentence, ' ', $word)
	                    )

	            (: the word ends with punctuation (guaranteed by the first branch) but the next word
	               is not capitalized (like "A.B.M. treaty" or "'What?' he asked."),
	               so consider it part of the existing sentence and move to the next word. :)
	            else if (not($next-word-is-capitalized)) then
	                local:get-first-sentence(
	                    $rest,
	                    concat($sentence, ' ', $word)
	                    )

	            (: if the word starts a known phrase that could be mistaken for the end of a sentence
	               (like "U.S. Government"), then keep going past the phrase. :)
	            else
	                let $sorted-phrases :=
	                    (: order by character length, longest to shortest, so the longest phrase wins :)
	                    for $phrase in $known-phrases-with-ignorable-final-punctuation-marks
	                    order by string-length($phrase) descending
	                    return $phrase
	                (: number of leading words to accept as part of the sentence, or () if no phrase matches :)
	                let $consumed-word-count :=
	                    (
	                        for $phrase in $sorted-phrases
	                        let $phrase-length := count(tokenize($phrase, ' '))
	                        let $candidate := string-join(subsequence($words, 1, $phrase-length), ' ')
	                        (: an empty phrase would consume nothing and recurse forever :)
	                        where $phrase-length ge 1
	                        return
	                            if (not(starts-with($candidate, $phrase))) then
	                                ()
	                            else
	                                (: whatever follows the phrase inside its last word, e.g. "." in "Government." :)
	                                let $tail := substring($candidate, string-length($phrase) + 1)
	                                return
	                                    (: exact match on word boundaries: accept the whole phrase :)
	                                    if ($tail eq '') then
	                                        $phrase-length
	                                    (: last word carries extra punctuation: accept the words before it and
	                                       let the last word be judged by the normal rules :)
	                                    else if ($phrase-length gt 1 and matches($tail, '^[^\p{L}\p{N}]+$')) then
	                                        $phrase-length - 1
	                                    (: the last word merely begins with the phrase's last word
	                                       (like "Governmental"): not a match :)
	                                    else
	                                        ()
	                    )[1]
	                return
	                    if (exists($consumed-word-count)) then
	                        local:get-first-sentence(
	                            subsequence($words, $consumed-word-count + 1),
	                            (: append the words as they appear in the text, not the phrase literal :)
	                            concat($sentence, ' ', string-join(subsequence($words, 1, $consumed-word-count), ' '))
	                            )

	                    (: the word ends the sentence - we're done with this sentence! :)
	                    else
	                        concat($sentence, ' ', $word)
	};

	(: sample text taken from http://history.state.gov/historicaldocuments/frus1964-68v06/d213 :)

	(: Demo: split a sample diplomatic note into sentences and report each one,
	   numbered from 1, inside a <p> element that also carries the total count. :)
	let $sample-note :=
	'154613. You should arrange to deliver following note to North Vietnamese Embassy.
	If in your opinion it can be done without creating an issue, we would prefer that
	you ask North Vietnamese Charge to come to your Embassy to receive note. “The U.S.
	Government agrees with the statement of the Government of the DRV, in its note of
	April 27, that it is necessary for Hanoi and Washington to engage in conversations
	promptly. The U.S. Government notes that the DRV has now agreed that representatives
	of the two countries should hold private discussions for the sole purpose of
	agreeing on a location and date. The U.S. Government notes that the DRV did not
	respond to its suggestion of April 23 that we meet for this limited purpose in a
	‘capital not previously considered by either side.’ The U.S. Government suggested
	the DRV might wish to indicate three appropriate locations suitable for this limited
	purpose. The U.S. Government does not consider that the suggestion of Warsaw is
	responsive or acceptable. The U.S. Government is prepared for these limited discussions
	on April 30 or several days thereafter. The U.S. Government would welcome the prompt
	response of the DRV to this suggestion.”'
	let $detected := local:tokenize-sentences($sample-note)
	(: one <s> per sentence, with its 1-based position in @n :)
	let $numbered :=
	    for $text at $position in $detected
	    return
	        element s {
	            attribute n { $position },
	            $text
	        }
	return
	    element p {
	        attribute sentence-count { count($detected) },
	        $numbered
	    }

	(: should return:

	<p sentence-count="10">
	<s n="1">154613.</s>
	<s n="2">You should arrange to deliver following note to North Vietnamese Embassy.</s>
	<s n="3">If in your opinion it can be done without creating an issue, we would prefer that you
	ask North Vietnamese Charge to come to your Embassy to receive note.</s>
	<s n="4">“The U.S. Government agrees with the statement of the Government of the DRV, in its
	note of April 27, that it is necessary for Hanoi and Washington to engage in conversations
	promptly.</s>
	<s n="5">The U.S. Government notes that the DRV has now agreed that representatives of the two
	countries should hold private discussions for the sole purpose of agreeing on a location and
	date.</s>
	<s n="6">The U.S. Government notes that the DRV did not respond to its suggestion of April 23
	that we meet for this limited purpose in a ‘capital not previously considered by either
	side.’</s>
	<s n="7">The U.S. Government suggested the DRV might wish to indicate three appropriate
	locations suitable for this limited purpose.</s>
	<s n="8">The U.S. Government does not consider that the suggestion of Warsaw is responsive or
	acceptable.</s>
	<s n="9">The U.S. Government is prepared for these limited discussions on April 30 or several
	days thereafter.</s>
	<s n="10">The U.S. Government would welcome the prompt response of the DRV to this
	suggestion.”</s>
	</p>
	:)