Skip to content

Instantly share code, notes, and snippets.

@desilinguist
Created February 14, 2012 13:56
Show Gist options
  • Save desilinguist/1826944 to your computer and use it in GitHub Desktop.
Save desilinguist/1826944 to your computer and use it in GitHub Desktop.
Latest version of treebank.py that fixes comma and colon errors when followed by numbers. Also the latest version of tokenize.doctest that tests for these errors.
.. Copyright (C) 2001-2012 NLTK Project
.. For license information, see LICENSE.TXT
>>> from nltk.tokenize import *
Regression Tests: Treebank Tokenizer
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Some test strings.
>>> s1 = "On a $50,000 mortgage of 30 years at 8 percent, the monthly payment would be $366.88."
>>> print word_tokenize(s1)
['On', 'a', '$', '50,000', 'mortgage', 'of', '30', 'years', 'at', '8', 'percent', ',', 'the', 'monthly', 'payment', 'would', 'be', '$', '366.88', '.']
>>> s2 = "\"We beat some pretty good teams to get here,\" Slocum said."
>>> print word_tokenize(s2)
['``', 'We', 'beat', 'some', 'pretty', 'good', 'teams', 'to', 'get', 'here', ',', "''", 'Slocum', 'said', '.']
>>> s3 = "Well, we couldn't have this predictable, cliche-ridden, \"Touched by an Angel\" (a show creator John Masius worked on) wanna-be if she didn't."
>>> print word_tokenize(s3)
['Well', ',', 'we', 'could', "n't", 'have', 'this', 'predictable', ',', 'cliche-ridden', ',', '``', 'Touched', 'by', 'an', 'Angel', "''", '(', 'a', 'show', 'creator', 'John', 'Masius', 'worked', 'on', ')', 'wanna-be', 'if', 'she', 'did', "n't", '.']
>>> s4 = "I cannot cannot work under these conditions!"
>>> print word_tokenize(s4)
['I', 'can', 'not', 'can', 'not', 'work', 'under', 'these', 'conditions', '!']
>>> s5 = "The company spent $30,000,000 last year."
>>> print word_tokenize(s5)
['The', 'company', 'spent', '$', '30,000,000', 'last', 'year', '.']
>>> s6 = "The company spent 40.75% of its income last year."
>>> print word_tokenize(s6)
['The', 'company', 'spent', '40.75', '%', 'of', 'its', 'income', 'last', 'year', '.']
>>> s7 = "He arrived at 3:00 pm."
>>> print word_tokenize(s7)
['He', 'arrived', 'at', '3:00', 'pm', '.']
>>> s8 = "I bought these items: books, pencils, and pens."
>>> print word_tokenize(s8)
['I', 'bought', 'these', 'items', ':', 'books', ',', 'pencils', ',', 'and', 'pens', '.']
>>> s9 = "Though there were 150, 100 of them were old."
>>> print word_tokenize(s9)
['Though', 'there', 'were', '150', ',', '100', 'of', 'them', 'were', 'old', '.']
>>> s10 = "There were 300,000, but that wasn't enough."
>>> print word_tokenize(s10)
['There', 'were', '300,000', ',', 'but', 'that', 'was', "n't", 'enough', '.']
Regression Tests: Regexp Tokenizer
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Some additional test strings.
>>> s = ("Good muffins cost $3.88\nin New York. Please buy me\n"
... "two of them.\n\nThanks.")
>>> s2 = ("Alas, it has not rained today. When, do you think, "
... "will it rain again?")
>>> s3 = ("<p>Although this is <b>not</b> the case here, we must "
... "not relax our vigilance!</p>")
>>> print regexp_tokenize(s2, r'[,\.\?!"]\s*', gaps=False)
[', ', '. ', ', ', ', ', '?']
>>> print regexp_tokenize(s2, r'[,\.\?!"]\s*', gaps=True)
['Alas', 'it has not rained today', 'When', 'do you think',
'will it rain again']
Make sure that grouping parentheses don't confuse the tokenizer:
>>> print regexp_tokenize(s3, r'</?(b|p)>', gaps=False)
['<p>', '<b>', '</b>', '</p>']
>>> print regexp_tokenize(s3, r'</?(b|p)>', gaps=True)
['Although this is ', 'not',
' the case here, we must not relax our vigilance!']
Make sure that named groups don't confuse the tokenizer:
>>> print regexp_tokenize(s3, r'</?(?P<named>b|p)>', gaps=False)
['<p>', '<b>', '</b>', '</p>']
>>> print regexp_tokenize(s3, r'</?(?P<named>b|p)>', gaps=True)
['Although this is ', 'not',
' the case here, we must not relax our vigilance!']
Make sure that nested groups don't confuse the tokenizer:
>>> print regexp_tokenize(s2, r'(h|r|l)a(s|(i|n0))', gaps=False)
['las', 'has', 'rai', 'rai']
>>> print regexp_tokenize(s2, r'(h|r|l)a(s|(i|n0))', gaps=True)
['A', ', it ', ' not ', 'ned today. When, do you think, will it ',
'n again?']
The tokenizer should reject any patterns with backreferences:
>>> print regexp_tokenize(s2, r'(.)\1')
Traceback (most recent call last):
...
ValueError: Regular expressions with back-references are
not supported: '(.)\\1'
>>> print regexp_tokenize(s2, r'(?P<foo>)(?P=foo)')
Traceback (most recent call last):
...
ValueError: Regular expressions with back-references are
not supported: '(?P<foo>)(?P=foo)'
A simple sentence tokenizer '\.(\s+|$)'
>>> print regexp_tokenize(s, pattern=r'\.(\s+|$)', gaps=True)
['Good muffins cost $3.88\nin New York',
'Please buy me\ntwo of them', 'Thanks']
# Natural Language Toolkit: Tokenizers
#
# Copyright (C) 2001-2012 NLTK Project
# Author: Edward Loper <edloper@gradient.cis.upenn.edu>
# Michael Heilman <mheilman@cmu.edu> (re-port from http://www.cis.upenn.edu/~treebank/tokenizer.sed)
#
# URL: <http://nltk.sourceforge.net>
# For license information, see LICENSE.TXT
r"""
Penn Treebank Tokenizer
The Treebank tokenizer uses regular expressions to tokenize text as in Penn Treebank.
This implementation is a port of the tokenizer sed script written by Robert McIntyre
and available at http://www.cis.upenn.edu/~treebank/tokenizer.sed.
"""
import re
from nltk.tokenize.api import TokenizerI
class TreebankWordTokenizer(TokenizerI):
"""
The Treebank tokenizer uses regular expressions to tokenize text as in Penn Treebank.
This is the method that is invoked by ``word_tokenize()``. It assumes that the
text has already been segmented into sentences, e.g. using ``sent_tokenize()``.
This tokenizer performs the following steps:
- split standard contractions, e.g. ``don't`` -> ``do n't`` and ``they'll`` -> ``they 'll``
- treat most punctuation characters as separate tokens
- split off commas and single quotes, when followed by whitespace
- separate periods that appear at the end of line
>>> from nltk.tokenize import TreebankWordTokenizer
>>> s = '''Good muffins cost $3.88\\nin New York. Please buy me\\ntwo of them.\\n\\nThanks.'''
>>> TreebankWordTokenizer().tokenize(s)
['Good', 'muffins', 'cost', '$', '3.88', 'in', 'New', 'York.',
'Please', 'buy', 'me', 'two', 'of', 'them', '.', 'Thanks', '.']
>>> s = "They'll save and invest more."
>>> TreebankWordTokenizer().tokenize(s)
['They', "'ll", 'save', 'and', 'invest', 'more', '.']
NB. this tokenizer assumes that the text is presented as one sentence per line,
where each line is delimited with a newline character.
The only periods to be treated as separate tokens are those appearing
at the end of a line.
"""
# List of contractions adapted from Robert MacIntyre's tokenizer.
CONTRACTIONS2 = [re.compile(r"(?i)\b(can)(not)\b"),
re.compile(r"(?i)\b(d)('ye)\b"),
re.compile(r"(?i)\b(gim)(me)\b"),
re.compile(r"(?i)\b(gon)(na)\b"),
re.compile(r"(?i)\b(got)(ta)\b"),
re.compile(r"(?i)\b(lem)(me)\b"),
re.compile(r"(?i)\b(mor)('n)\b"),
re.compile(r"(?i)\b(wan)(na) ")]
CONTRACTIONS3 = [re.compile(r"(?i) ('t)(is)\b"),
re.compile(r"(?i) ('t)(was)\b")]
CONTRACTIONS4 = [re.compile(r"(?i)\b(whad)(dd)(ya)\b"),
re.compile(r"(?i)\b(wha)(t)(cha)\b")]
def tokenize(self, text):
#starting quotes
text = re.sub(r'^\"', r'``', text)
text = re.sub(r'(``)', r' \1 ', text)
text = re.sub(r'([ (\[{<])"', r'\1 `` ', text)
#punctuation
text = re.sub(r'([:,])([^\d])', r' \1 \2', text)
text = re.sub(r'\.\.\.', r' ... ', text)
text = re.sub(r'[;@#$%&]', r' \g<0> ', text)
text = re.sub(r'([^\.])(\.)([\]\)}>"\']*)\s*$', r'\1 \2\3 ', text)
text = re.sub(r'[?!]', r' \g<0> ', text)
text = re.sub(r"([^'])' ", r"\1 ' ", text)
#parens, brackets, etc.
text = re.sub(r'[\]\[\(\)\{\}\<\>]', r' \g<0> ', text)
text = re.sub(r'--', r' -- ', text)
#add extra space to make things easier
text = " " + text + " "
#ending quotes
text = re.sub(r'"', " '' ", text)
text = re.sub(r'(\S)(\'\')', r'\1 \2 ', text)
text = re.sub(r"([^' ])('[sS]|'[mM]|'[dD]|') ", r"\1 \2 ", text)
text = re.sub(r"([^' ])('ll|'re|'ve|n't|) ", r"\1 \2 ", text)
text = re.sub(r"([^' ])('LL|'RE|'VE|N'T|) ", r"\1 \2 ", text)
for regexp in self.CONTRACTIONS2:
text = regexp.sub(r' \1 \2 ', text)
for regexp in self.CONTRACTIONS3:
text = regexp.sub(r' \1 \2 ', text)
# We are not using CONTRACTIONS4 since
# they are also commented out in the SED scripts
# for regexp in self.CONTRACTIONS4:
# text = regexp.sub(r' \1 \2 \3 ', text)
text = re.sub(" +", " ", text)
text = text.strip()
#add space at end to match up with MacIntyre's output (for debugging)
if text != "":
text += " "
return text.split()
if __name__ == "__main__":
import doctest
doctest.testmod(optionflags=doctest.NORMALIZE_WHITESPACE)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment