Created
February 14, 2012 13:56
-
-
Save desilinguist/1826944 to your computer and use it in GitHub Desktop.
Latest version of treebank.py that fixes comma and colon errors when followed by numbers. Also the latest version of tokenize.doctest that tests for these errors.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
.. Copyright (C) 2001-2012 NLTK Project
.. For license information, see LICENSE.TXT

    >>> from nltk.tokenize import *

Regression Tests: Treebank Tokenizer
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

Some test strings.

    >>> s1 = "On a $50,000 mortgage of 30 years at 8 percent, the monthly payment would be $366.88."
    >>> print word_tokenize(s1)
    ['On', 'a', '$', '50,000', 'mortgage', 'of', '30', 'years', 'at', '8', 'percent', ',', 'the', 'monthly', 'payment', 'would', 'be', '$', '366.88', '.']

    >>> s2 = "\"We beat some pretty good teams to get here,\" Slocum said."
    >>> print word_tokenize(s2)
    ['``', 'We', 'beat', 'some', 'pretty', 'good', 'teams', 'to', 'get', 'here', ',', "''", 'Slocum', 'said', '.']

    >>> s3 = "Well, we couldn't have this predictable, cliche-ridden, \"Touched by an Angel\" (a show creator John Masius worked on) wanna-be if she didn't."
    >>> print word_tokenize(s3)
    ['Well', ',', 'we', 'could', "n't", 'have', 'this', 'predictable', ',', 'cliche-ridden', ',', '``', 'Touched', 'by', 'an', 'Angel', "''", '(', 'a', 'show', 'creator', 'John', 'Masius', 'worked', 'on', ')', 'wanna-be', 'if', 'she', 'did', "n't", '.']

    >>> s4 = "I cannot cannot work under these conditions!"
    >>> print word_tokenize(s4)
    ['I', 'can', 'not', 'can', 'not', 'work', 'under', 'these', 'conditions', '!']

    >>> s5 = "The company spent $30,000,000 last year."
    >>> print word_tokenize(s5)
    ['The', 'company', 'spent', '$', '30,000,000', 'last', 'year', '.']

    >>> s6 = "The company spent 40.75% of its income last year."
    >>> print word_tokenize(s6)
    ['The', 'company', 'spent', '40.75', '%', 'of', 'its', 'income', 'last', 'year', '.']

    >>> s7 = "He arrived at 3:00 pm."
    >>> print word_tokenize(s7)
    ['He', 'arrived', 'at', '3:00', 'pm', '.']

    >>> s8 = "I bought these items: books, pencils, and pens."
    >>> print word_tokenize(s8)
    ['I', 'bought', 'these', 'items', ':', 'books', ',', 'pencils', ',', 'and', 'pens', '.']

    >>> s9 = "Though there were 150, 100 of them were old."
    >>> print word_tokenize(s9)
    ['Though', 'there', 'were', '150', ',', '100', 'of', 'them', 'were', 'old', '.']

    >>> s10 = "There were 300,000, but that wasn't enough."
    >>> print word_tokenize(s10)
    ['There', 'were', '300,000', ',', 'but', 'that', 'was', "n't", 'enough', '.']
Regression Tests: Regexp Tokenizer
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

Some additional test strings.

    >>> s = ("Good muffins cost $3.88\nin New York.  Please buy me\n"
    ...      "two of them.\n\nThanks.")
    >>> s2 = ("Alas, it has not rained today. When, do you think, "
    ...       "will it rain again?")
    >>> s3 = ("<p>Although this is <b>not</b> the case here, we must "
    ...       "not relax our vigilance!</p>")

    >>> print regexp_tokenize(s2, r'[,\.\?!"]\s*', gaps=False)
    [', ', '. ', ', ', ', ', '?']
    >>> print regexp_tokenize(s2, r'[,\.\?!"]\s*', gaps=True)
    ['Alas', 'it has not rained today', 'When', 'do you think',
    'will it rain again']

Make sure that grouping parentheses don't confuse the tokenizer:

    >>> print regexp_tokenize(s3, r'</?(b|p)>', gaps=False)
    ['<p>', '<b>', '</b>', '</p>']
    >>> print regexp_tokenize(s3, r'</?(b|p)>', gaps=True)
    ['Although this is ', 'not',
    ' the case here, we must not relax our vigilance!']

Make sure that named groups don't confuse the tokenizer:

    >>> print regexp_tokenize(s3, r'</?(?P<named>b|p)>', gaps=False)
    ['<p>', '<b>', '</b>', '</p>']
    >>> print regexp_tokenize(s3, r'</?(?P<named>b|p)>', gaps=True)
    ['Although this is ', 'not',
    ' the case here, we must not relax our vigilance!']

Make sure that nested groups don't confuse the tokenizer:

    >>> print regexp_tokenize(s2, r'(h|r|l)a(s|(i|n0))', gaps=False)
    ['las', 'has', 'rai', 'rai']
    >>> print regexp_tokenize(s2, r'(h|r|l)a(s|(i|n0))', gaps=True)
    ['A', ', it ', ' not ', 'ned today. When, do you think, will it ',
    'n again?']

The tokenizer should reject any patterns with backreferences:

    >>> print regexp_tokenize(s2, r'(.)\1')
    Traceback (most recent call last):
    ...
    ValueError: Regular expressions with back-references are
    not supported: '(.)\\1'
    >>> print regexp_tokenize(s2, r'(?P<foo>)(?P=foo)')
    Traceback (most recent call last):
    ...
    ValueError: Regular expressions with back-references are
    not supported: '(?P<foo>)(?P=foo)'

A simple sentence tokenizer '\.(\s+|$)':

    >>> print regexp_tokenize(s, pattern=r'\.(\s+|$)', gaps=True)
    ['Good muffins cost $3.88\nin New York',
    'Please buy me\ntwo of them', 'Thanks']
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Natural Language Toolkit: Tokenizers
#
# Copyright (C) 2001-2012 NLTK Project
# Author: Edward Loper <edloper@gradient.cis.upenn.edu>
#         Michael Heilman <mheilman@cmu.edu> (re-port from http://www.cis.upenn.edu/~treebank/tokenizer.sed)
#
# URL: <http://nltk.sourceforge.net>
# For license information, see LICENSE.TXT

r"""
Penn Treebank Tokenizer

The Treebank tokenizer uses regular expressions to tokenize text as in Penn Treebank.
This implementation is a port of the tokenizer sed script written by Robert McIntyre
and available at http://www.cis.upenn.edu/~treebank/tokenizer.sed.
"""

import re

from nltk.tokenize.api import TokenizerI
class TreebankWordTokenizer(TokenizerI):
    """
    The Treebank tokenizer uses regular expressions to tokenize text as in Penn Treebank.
    This is the method that is invoked by ``word_tokenize()``.  It assumes that the
    text has already been segmented into sentences, e.g. using ``sent_tokenize()``.

    This tokenizer performs the following steps:

    - split standard contractions, e.g. ``don't`` -> ``do n't`` and ``they'll`` -> ``they 'll``
    - treat most punctuation characters as separate tokens
    - split off commas and single quotes, when followed by whitespace
    - separate periods that appear at the end of line

        >>> from nltk.tokenize import TreebankWordTokenizer
        >>> s = '''Good muffins cost $3.88\\nin New York.  Please buy me\\ntwo of them.\\n\\nThanks.'''
        >>> TreebankWordTokenizer().tokenize(s)
        ['Good', 'muffins', 'cost', '$', '3.88', 'in', 'New', 'York.',
        'Please', 'buy', 'me', 'two', 'of', 'them', '.', 'Thanks', '.']
        >>> s = "They'll save and invest more."
        >>> TreebankWordTokenizer().tokenize(s)
        ['They', "'ll", 'save', 'and', 'invest', 'more', '.']

    NB. this tokenizer assumes that the text is presented as one sentence per line,
    where each line is delimited with a newline character.
    The only periods to be treated as separate tokens are those appearing
    at the end of a line.
    """

    # List of contractions adapted from Robert MacIntyre's tokenizer.
    # Two-part contractions, e.g. "cannot" -> "can not", "gonna" -> "gon na".
    CONTRACTIONS2 = [re.compile(r"(?i)\b(can)(not)\b"),
                     re.compile(r"(?i)\b(d)('ye)\b"),
                     re.compile(r"(?i)\b(gim)(me)\b"),
                     re.compile(r"(?i)\b(gon)(na)\b"),
                     re.compile(r"(?i)\b(got)(ta)\b"),
                     re.compile(r"(?i)\b(lem)(me)\b"),
                     re.compile(r"(?i)\b(mor)('n)\b"),
                     re.compile(r"(?i)\b(wan)(na) ")]
    # Leading-apostrophe contractions: "'tis" -> "'t is", "'twas" -> "'t was".
    # The leading space in the pattern is safe because tokenize() pads the
    # text with spaces before these are applied.
    CONTRACTIONS3 = [re.compile(r"(?i) ('t)(is)\b"),
                     re.compile(r"(?i) ('t)(was)\b")]
    # Three-part contractions; kept for parity with the sed script but not
    # applied (see note in tokenize()).
    CONTRACTIONS4 = [re.compile(r"(?i)\b(whad)(dd)(ya)\b"),
                     re.compile(r"(?i)\b(wha)(t)(cha)\b")]

    def tokenize(self, text):
        """
        Tokenize ``text`` (a single sentence) and return a list of token
        strings, applying the Penn Treebank conventions described in the
        class docstring.
        """
        # starting quotes: a double quote at the start of the text or after
        # an opening bracket/space becomes Treebank-style backquotes.
        text = re.sub(r'^\"', r'``', text)
        text = re.sub(r'(``)', r' \1 ', text)
        text = re.sub(r'([ (\[{<])"', r'\1 `` ', text)

        # punctuation: split ':' and ',' only when NOT followed by a digit,
        # so numbers like "50,000" and times like "3:00" stay intact.
        text = re.sub(r'([:,])([^\d])', r' \1 \2', text)
        text = re.sub(r'\.\.\.', r' ... ', text)
        text = re.sub(r'[;@#$%&]', r' \g<0> ', text)
        # split a sentence-final period, even when it is followed by closing
        # brackets/quotes (e.g. 'word.")' -> 'word ."' + ')').
        text = re.sub(r'([^\.])(\.)([\]\)}>"\']*)\s*$', r'\1 \2\3 ', text)
        text = re.sub(r'[?!]', r' \g<0> ', text)
        text = re.sub(r"([^'])' ", r"\1 ' ", text)

        # parens, brackets, etc.
        text = re.sub(r'[\]\[\(\)\{\}\<\>]', r' \g<0> ', text)
        text = re.sub(r'--', r' -- ', text)

        # add extra space to make things easier
        text = " " + text + " "

        # ending quotes
        text = re.sub(r'"', " '' ", text)
        text = re.sub(r'(\S)(\'\')', r'\1 \2 ', text)
        # Split off clitics ('s, 'm, 'd, a bare possessive ', 'll, 're, 've,
        # n't — and their upper-case forms) before a space.
        # BUGFIX: the 'll/'re/'ve/n't alternations originally ended with an
        # empty branch ("...|n't|) "), which matched the empty string after
        # every non-quote character followed by a space and performed a
        # useless substitution there; the stray "|" is removed.  (Token
        # output is unchanged because runs of spaces are collapsed below.)
        text = re.sub(r"([^' ])('[sS]|'[mM]|'[dD]|') ", r"\1 \2 ", text)
        text = re.sub(r"([^' ])('ll|'re|'ve|n't) ", r"\1 \2 ", text)
        text = re.sub(r"([^' ])('LL|'RE|'VE|N'T) ", r"\1 \2 ", text)

        for regexp in self.CONTRACTIONS2:
            text = regexp.sub(r' \1 \2 ', text)
        for regexp in self.CONTRACTIONS3:
            text = regexp.sub(r' \1 \2 ', text)

        # We are not using CONTRACTIONS4 since
        # they are also commented out in the SED scripts
        # for regexp in self.CONTRACTIONS4:
        #     text = regexp.sub(r' \1 \2 \3 ', text)

        # collapse the runs of spaces introduced by the substitutions above
        text = re.sub(" +", " ", text)
        text = text.strip()

        # add space at end to match up with MacIntyre's output (for debugging)
        if text != "":
            text += " "

        return text.split()
if __name__ == "__main__":
    # Self-test entry point: run the doctest examples embedded in this
    # module, treating differing runs of whitespace in expected output
    # as equivalent.
    import doctest
    doctest.testmod(optionflags=doctest.NORMALIZE_WHITESPACE)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment