Fixed Treebank Tokenizer for NLTK

tokenize.doctest
.. Copyright (C) 2001-2012 NLTK Project
.. For license information, see LICENSE.TXT
 
>>> from nltk.tokenize import *
 
Regression Tests: Treebank Tokenizer
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
Some test strings.
 
>>> s1 = "On a $50,000 mortgage of 30 years at 8 percent, the monthly payment would be $366.88."
>>> print word_tokenize(s1)
['On', 'a', '$', '50', ',', '000', 'mortgage', 'of', '30', 'years', 'at', '8', 'percent', ',', 'the', 'monthly', 'payment', 'would', 'be', '$', '366.88', '.']
>>> s2 = "\"We beat some pretty good teams to get here,\" Slocum said."
>>> print word_tokenize(s2)
['``', 'We', 'beat', 'some', 'pretty', 'good', 'teams', 'to', 'get', 'here', ',', "''", 'Slocum', 'said', '.']
>>> s3 = "Well, we couldn't have this predictable, cliche-ridden, \"Touched by an Angel\" (a show creator John Masius worked on) wanna-be if she didn't."
>>> print word_tokenize(s3)
['Well', ',', 'we', 'could', "n't", 'have', 'this', 'predictable', ',', 'cliche-ridden', ',', '``', 'Touched', 'by', 'an', 'Angel', "''", '(', 'a', 'show', 'creator', 'John', 'Masius', 'worked', 'on', ')', 'wanna-be', 'if', 'she', 'did', "n't", '.']
>>> s4 = "I cannot cannot work under these conditions!"
>>> print word_tokenize(s4)
['I', 'can', 'not', 'can', 'not', 'work', 'under', 'these', 'conditions', '!']
 
Regression Tests: Regexp Tokenizer
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
Some additional test strings.
 
>>> s = ("Good muffins cost $3.88\nin New York. Please buy me\n"
... "two of them.\n\nThanks.")
>>> s2 = ("Alas, it has not rained today. When, do you think, "
... "will it rain again?")
>>> s3 = ("<p>Although this is <b>not</b> the case here, we must "
... "not relax our vigilance!</p>")
 
>>> print regexp_tokenize(s2, r'[,\.\?!"]\s*', gaps=False)
[', ', '. ', ', ', ', ', '?']
>>> print regexp_tokenize(s2, r'[,\.\?!"]\s*', gaps=True)
['Alas', 'it has not rained today', 'When', 'do you think',
'will it rain again']
 
Make sure that grouping parentheses don't confuse the tokenizer:
 
>>> print regexp_tokenize(s3, r'</?(b|p)>', gaps=False)
['<p>', '<b>', '</b>', '</p>']
>>> print regexp_tokenize(s3, r'</?(b|p)>', gaps=True)
['Although this is ', 'not',
' the case here, we must not relax our vigilance!']
 
Make sure that named groups don't confuse the tokenizer:
 
>>> print regexp_tokenize(s3, r'</?(?P<named>b|p)>', gaps=False)
['<p>', '<b>', '</b>', '</p>']
>>> print regexp_tokenize(s3, r'</?(?P<named>b|p)>', gaps=True)
['Although this is ', 'not',
' the case here, we must not relax our vigilance!']
 
Make sure that nested groups don't confuse the tokenizer:
 
>>> print regexp_tokenize(s2, r'(h|r|l)a(s|(i|n0))', gaps=False)
['las', 'has', 'rai', 'rai']
>>> print regexp_tokenize(s2, r'(h|r|l)a(s|(i|n0))', gaps=True)
['A', ', it ', ' not ', 'ned today. When, do you think, will it ',
'n again?']
 
The tokenizer should reject any patterns with backreferences:
 
>>> print regexp_tokenize(s2, r'(.)\1')
Traceback (most recent call last):
...
ValueError: Regular expressions with back-references are
not supported: '(.)\\1'
>>> print regexp_tokenize(s2, r'(?P<foo>)(?P=foo)')
Traceback (most recent call last):
...
ValueError: Regular expressions with back-references are
not supported: '(?P<foo>)(?P=foo)'
 
A simple sentence tokenizer '\.(\s+|$)'
 
>>> print regexp_tokenize(s, pattern=r'\.(\s+|$)', gaps=True)
['Good muffins cost $3.88\nin New York',
'Please buy me\ntwo of them', 'Thanks']
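The group-handling behavior exercised above can be sketched as follows. This is an illustrative reimplementation under my own assumptions, not NLTK's actual `RegexpTokenizer` source:

```python
import re

def simple_regexp_tokenize(text, pattern, gaps=False):
    """Tokenize with a user-supplied regexp, ignoring its capturing groups."""
    # Rewrite capturing groups, both plain "(...)" and named
    # "(?P<name>...)", as non-capturing "(?:...)" so they cannot leak
    # into re.split()/re.findall() output. (A production version would
    # also have to skip escaped parens like "\(".)
    neutral = re.sub(r"\((\?P<[^>]+>)?(?![?:])", "(?:", pattern)
    if gaps:
        return [tok for tok in re.split(neutral, text) if tok]
    return re.findall(neutral, text)

s3 = ("<p>Although this is <b>not</b> the case here, we must "
      "not relax our vigilance!</p>")
print(simple_regexp_tokenize(s3, r'</?(?P<named>b|p)>'))
# -> ['<p>', '<b>', '</b>', '</p>']
```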
treebank2-heilman.py
r"""
 
Penn Treebank Tokenizer
 
The Treebank tokenizer uses regular expressions to tokenize text as in the Penn Treebank.
This implementation is a port of the tokenizer sed script written by Robert MacIntyre
and available at http://www.cis.upenn.edu/~treebank/tokenizer.sed.
This is the method that is invoked by ``word_tokenize()``. It assumes that the
text has already been segmented into sentences, e.g. using ``sent_tokenize()``.
 
This tokenizer performs the following steps:
 
- split standard contractions, e.g. ``don't`` -> ``do n't`` and ``they'll`` -> ``they 'll``
- treat most punctuation characters as separate tokens
- split off commas and single quotes, when followed by whitespace
- separate periods that appear at the end of line
 
>>> from nltk.tokenize import TreebankWordTokenizer
>>> s = "Good muffins cost $3.88\nin New York. Please buy me\ntwo of them.\n\nThanks."
>>> TreebankWordTokenizer().tokenize(s)
['Good', 'muffins', 'cost', '$', '3.88', 'in', 'New', 'York.',
'Please', 'buy', 'me', 'two', 'of', 'them.', 'Thanks', '.']
>>> s = "They'll save and invest more."
>>> TreebankWordTokenizer().tokenize(s)
['They', "'ll", 'save', 'and', 'invest', 'more', '.']
 
"""
 
import re
from api import *
 
 
class TreebankWordTokenizer(TokenizerI):
    # List of contractions adapted from Robert MacIntyre's tokenizer.
    CONTRACTIONS2 = [re.compile(r"\b(can)(not)\b", flags=re.IGNORECASE),
                     re.compile(r"\b(d)('ye)\b", flags=re.IGNORECASE),
                     re.compile(r"\b(gim)(me)\b", flags=re.IGNORECASE),
                     re.compile(r"\b(gon)(na)\b", flags=re.IGNORECASE),
                     re.compile(r"\b(got)(ta)\b", flags=re.IGNORECASE),
                     re.compile(r"\b(lem)(me)\b", flags=re.IGNORECASE),
                     re.compile(r"\b(mor)('n)\b", flags=re.IGNORECASE),
                     re.compile(r"\b(wan)(na) ", flags=re.IGNORECASE)]
    CONTRACTIONS3 = [re.compile(r" ('t)(is)\b", flags=re.IGNORECASE),
                     re.compile(r" ('t)(was)\b", flags=re.IGNORECASE)]
    CONTRACTIONS4 = [re.compile(r"\b(whad)(dd)(ya)\b", flags=re.IGNORECASE),
                     re.compile(r"\b(wha)(t)(cha)\b", flags=re.IGNORECASE)]
 
    def tokenize(self, text):
        # starting quotes
        text = re.sub(r'^\"', r'``', text)
        text = re.sub(r'(``)', r' \1 ', text)
        text = re.sub(r'([ (\[{<])"', r'\1 `` ', text)

        # punctuation
        text = re.sub(r'\.\.\.', r' ... ', text)
        text = re.sub(r'[,;:@#$%&]', r' \g<0> ', text)
        text = re.sub(r'([^\.])(\.)([\]\)}>"\']*)\s*$', r'\1 \2\3 ', text)
        text = re.sub(r'[?!]', r' \g<0> ', text)

        text = re.sub(r"([^'])' ", r"\1 ' ", text)

        # parens, brackets, etc.
        text = re.sub(r'[\]\[\(\)\{\}\<\>]', r' \g<0> ', text)
        text = re.sub(r'--', r' -- ', text)

        # add extra space to make things easier
        text = " " + text + " "

        # ending quotes
        text = re.sub(r'"', " '' ", text)
        text = re.sub(r'(\S)(\'\')', r'\1 \2 ', text)

        text = re.sub(r"([^' ])('[sS]|'[mM]|'[dD]|') ", r"\1 \2 ", text)
        text = re.sub(r"([^' ])('ll|'re|'ve|n't|) ", r"\1 \2 ", text)
        text = re.sub(r"([^' ])('LL|'RE|'VE|N'T|) ", r"\1 \2 ", text)

        for regexp in self.CONTRACTIONS2:
            text = regexp.sub(r' \1 \2 ', text)
        for regexp in self.CONTRACTIONS3:
            text = regexp.sub(r' \1 \2 ', text)

        # We are not using CONTRACTIONS4 since
        # they are also commented out in the SED scripts
        # for regexp in self.CONTRACTIONS4:
        #     text = regexp.sub(r' \1 \2 \3 ', text)

        text = re.sub(" +", " ", text)
        text = text.strip()

        # add space at end to match up with MacIntyre's output (for debugging)
        if text != "":
            text += " "

        return text.split()
 
# if __name__ == "__main__":
#     import sys
#     t = TreebankWordTokenizer()
#     for line in sys.stdin:
#         line = line.strip()
#         print t.tokenize(line)

In this version, CONTRACTIONS2 requires tokens to be space-separated, whereas \b should be used instead (as in the existing implementation). As a result, it breaks in cases like these:

"This cannot cannot be right!" --> "This can not cannot be right !"
"This cannot work." --> "This cannot work ."
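The difference is easy to reproduce with plain `re`; the patterns below are an illustrative comparison, not the gist's code itself:

```python
import re

space_pat = re.compile(r" (can)(not) ", re.IGNORECASE)   # space-delimited
word_pat = re.compile(r"\b(can)(not)\b", re.IGNORECASE)  # word boundaries

s = "This cannot cannot be right !"

# The space-delimited pattern consumes the trailing space of its first
# match, so the immediately following "cannot" is never split:
print(" ".join(space_pat.sub(r" \1 \2 ", s).split()))
# -> This can not cannot be right !

# The \b-delimited pattern splits every occurrence:
print(" ".join(word_pat.sub(r" \1 \2 ", s).split()))
# -> This can not can not be right !
```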

I fixed the gist to solve this issue.

Also, the flags argument to re.sub requires Python 2.7. The existing version of treebank.py works with earlier versions. Do we have any tests?
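The portable workaround, roughly: `re.sub()` only accepts a `flags` argument from Python 2.7 on, but flags can always be baked into a compiled pattern:

```python
import re

# Instead of re.sub(pattern, repl, text, flags=re.IGNORECASE), which
# needs Python 2.7+, pre-compile the pattern with the flag:
pattern = re.compile(r"\b(can)(not)\b", re.IGNORECASE)
print(pattern.sub(r"\1 \2", "He CANNOT go."))
# -> He CAN NOT go.
```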

Removed the 'flags' argument to re.sub. Working on additional unit tests now. Should these go into test/tokenize.doctest?

Great, thanks. Yes, please put them in tokenize.doctest. Also, please see tokenize/regexp.py for an example of doctests inside docstrings for the purpose of user documentation.

So, I tested this script against the official Penn Treebank sed script on a sample of 100,000 sentences from the NYT section of Gigaword. The latest version above gets exactly the same results on this sample as the sed script, so I am pretty confident that this version is as close to official Treebank tokenization as possible. I added some simple doctests to the docstring. I am also attaching a new version of tokenize.doctest here, which contains some unit tests for treebank tokenization.

Let me know if I can check these in.
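For reference, the shape of that differential test, as a generic sketch with toy stand-in tokenizers (in the actual run, one side was `TreebankWordTokenizer` and the other the output of the official `tokenizer.sed`):

```python
import re

def diff_tokenizations(tok_a, tok_b, sentences):
    """Return (sentence, tokens_a, tokens_b) for every disagreement."""
    return [(s, tok_a(s), tok_b(s))
            for s in sentences
            if tok_a(s) != tok_b(s)]

# Toy stand-ins: whitespace splitting vs. a punctuation-aware regexp.
naive = str.split
punct_aware = lambda s: re.findall(r"\w+|[^\w\s]", s)

for s, a, b in diff_tokenizations(naive, punct_aware,
                                  ["Hello world", "Thanks."]):
    print("%r: %s vs %s" % (s, a, b))
# -> 'Thanks.': ['Thanks.'] vs ['Thanks', '.']
```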

Thanks. Please go ahead, but first remember we can't use the "flags" named argument to re.compile as this requires Python 2.7 and we are still supporting 2.5.

I've removed the "flags" arguments, reinstated the class docstring, added a missing line of code to the docstring:
TreebankWordTokenizer().tokenize(s)
I've also acknowledged the author.
https://github.com/nltk/nltk/commit/e660a6827d44e242b4114abba7fc30fef6b0125c
