Created
February 14, 2012 13:56
-
-
Save desilinguist/1826944 to your computer and use it in GitHub Desktop.
Latest version of treebank.py that fixes comma and colon errors when followed by numbers. Also the latest version of tokenize.doctest that tests for these errors.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
.. Copyright (C) 2001-2012 NLTK Project
.. For license information, see LICENSE.TXT

    >>> from nltk.tokenize import *

Regression Tests: Treebank Tokenizer
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

Some test strings.

    >>> s1 = "On a $50,000 mortgage of 30 years at 8 percent, the monthly payment would be $366.88."
    >>> print word_tokenize(s1)
    ['On', 'a', '$', '50,000', 'mortgage', 'of', '30', 'years', 'at', '8', 'percent', ',', 'the', 'monthly', 'payment', 'would', 'be', '$', '366.88', '.']

    >>> s2 = "\"We beat some pretty good teams to get here,\" Slocum said."
    >>> print word_tokenize(s2)
    ['``', 'We', 'beat', 'some', 'pretty', 'good', 'teams', 'to', 'get', 'here', ',', "''", 'Slocum', 'said', '.']

    >>> s3 = "Well, we couldn't have this predictable, cliche-ridden, \"Touched by an Angel\" (a show creator John Masius worked on) wanna-be if she didn't."
    >>> print word_tokenize(s3)
    ['Well', ',', 'we', 'could', "n't", 'have', 'this', 'predictable', ',', 'cliche-ridden', ',', '``', 'Touched', 'by', 'an', 'Angel', "''", '(', 'a', 'show', 'creator', 'John', 'Masius', 'worked', 'on', ')', 'wanna-be', 'if', 'she', 'did', "n't", '.']

    >>> s4 = "I cannot cannot work under these conditions!"
    >>> print word_tokenize(s4)
    ['I', 'can', 'not', 'can', 'not', 'work', 'under', 'these', 'conditions', '!']

    >>> s5 = "The company spent $30,000,000 last year."
    >>> print word_tokenize(s5)
    ['The', 'company', 'spent', '$', '30,000,000', 'last', 'year', '.']

    >>> s6 = "The company spent 40.75% of its income last year."
    >>> print word_tokenize(s6)
    ['The', 'company', 'spent', '40.75', '%', 'of', 'its', 'income', 'last', 'year', '.']

    >>> s7 = "He arrived at 3:00 pm."
    >>> print word_tokenize(s7)
    ['He', 'arrived', 'at', '3:00', 'pm', '.']

    >>> s8 = "I bought these items: books, pencils, and pens."
    >>> print word_tokenize(s8)
    ['I', 'bought', 'these', 'items', ':', 'books', ',', 'pencils', ',', 'and', 'pens', '.']

    >>> s9 = "Though there were 150, 100 of them were old."
    >>> print word_tokenize(s9)
    ['Though', 'there', 'were', '150', ',', '100', 'of', 'them', 'were', 'old', '.']

    >>> s10 = "There were 300,000, but that wasn't enough."
    >>> print word_tokenize(s10)
    ['There', 'were', '300,000', ',', 'but', 'that', 'was', "n't", 'enough', '.']
Regression Tests: Regexp Tokenizer
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

Some additional test strings.

    >>> s = ("Good muffins cost $3.88\nin New York.  Please buy me\n"
    ...      "two of them.\n\nThanks.")
    >>> s2 = ("Alas, it has not rained today. When, do you think, "
    ...       "will it rain again?")
    >>> s3 = ("<p>Although this is <b>not</b> the case here, we must "
    ...       "not relax our vigilance!</p>")

    >>> print regexp_tokenize(s2, r'[,\.\?!"]\s*', gaps=False)
    [', ', '. ', ', ', ', ', '?']
    >>> print regexp_tokenize(s2, r'[,\.\?!"]\s*', gaps=True)
    ['Alas', 'it has not rained today', 'When', 'do you think',
    'will it rain again']

Make sure that grouping parentheses don't confuse the tokenizer:

    >>> print regexp_tokenize(s3, r'</?(b|p)>', gaps=False)
    ['<p>', '<b>', '</b>', '</p>']
    >>> print regexp_tokenize(s3, r'</?(b|p)>', gaps=True)
    ['Although this is ', 'not',
    ' the case here, we must not relax our vigilance!']

Make sure that named groups don't confuse the tokenizer:

    >>> print regexp_tokenize(s3, r'</?(?P<named>b|p)>', gaps=False)
    ['<p>', '<b>', '</b>', '</p>']
    >>> print regexp_tokenize(s3, r'</?(?P<named>b|p)>', gaps=True)
    ['Although this is ', 'not',
    ' the case here, we must not relax our vigilance!']

Make sure that nested groups don't confuse the tokenizer:

    >>> print regexp_tokenize(s2, r'(h|r|l)a(s|(i|n0))', gaps=False)
    ['las', 'has', 'rai', 'rai']
    >>> print regexp_tokenize(s2, r'(h|r|l)a(s|(i|n0))', gaps=True)
    ['A', ', it ', ' not ', 'ned today. When, do you think, will it ',
    'n again?']

The tokenizer should reject any patterns with backreferences:

    >>> print regexp_tokenize(s2, r'(.)\1')
    Traceback (most recent call last):
    ...
    ValueError: Regular expressions with back-references are
    not supported: '(.)\\1'
    >>> print regexp_tokenize(s2, r'(?P<foo>)(?P=foo)')
    Traceback (most recent call last):
    ...
    ValueError: Regular expressions with back-references are
    not supported: '(?P<foo>)(?P=foo)'

A simple sentence tokenizer '\.(\s+|$)':

    >>> print regexp_tokenize(s, pattern=r'\.(\s+|$)', gaps=True)
    ['Good muffins cost $3.88\nin New York',
    'Please buy me\ntwo of them', 'Thanks']
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Natural Language Toolkit: Tokenizers
#
# Copyright (C) 2001-2012 NLTK Project
# Author: Edward Loper <edloper@gradient.cis.upenn.edu>
#         Michael Heilman <mheilman@cmu.edu> (re-port from http://www.cis.upenn.edu/~treebank/tokenizer.sed)
#
# URL: <http://nltk.sourceforge.net>
# For license information, see LICENSE.TXT

r"""
Penn Treebank Tokenizer

The Treebank tokenizer uses regular expressions to tokenize text as in Penn Treebank.
This implementation is a port of the tokenizer sed script written by Robert McIntyre
and available at http://www.cis.upenn.edu/~treebank/tokenizer.sed.
"""

import re

from nltk.tokenize.api import TokenizerI
class TreebankWordTokenizer(TokenizerI):
    """
    The Treebank tokenizer uses regular expressions to tokenize text as in Penn Treebank.
    This is the method that is invoked by ``word_tokenize()``.  It assumes that the
    text has already been segmented into sentences, e.g. using ``sent_tokenize()``.

    This tokenizer performs the following steps:

    - split standard contractions, e.g. ``don't`` -> ``do n't`` and ``they'll`` -> ``they 'll``
    - treat most punctuation characters as separate tokens
    - split off commas and single quotes, when followed by whitespace
    - separate periods that appear at the end of line

        >>> from nltk.tokenize import TreebankWordTokenizer
        >>> s = '''Good muffins cost $3.88\\nin New York.  Please buy me\\ntwo of them.\\n\\nThanks.'''
        >>> TreebankWordTokenizer().tokenize(s)
        ['Good', 'muffins', 'cost', '$', '3.88', 'in', 'New', 'York.',
        'Please', 'buy', 'me', 'two', 'of', 'them', '.', 'Thanks', '.']
        >>> s = "They'll save and invest more."
        >>> TreebankWordTokenizer().tokenize(s)
        ['They', "'ll", 'save', 'and', 'invest', 'more', '.']

    NB. this tokenizer assumes that the text is presented as one sentence per line,
    where each line is delimited with a newline character.
    The only periods to be treated as separate tokens are those appearing
    at the end of a line.
    """

    # List of contractions adapted from Robert MacIntyre's tokenizer.
    # Two-part contractions, e.g. "cannot" -> "can not", "gonna" -> "gon na".
    CONTRACTIONS2 = [re.compile(r"(?i)\b(can)(not)\b"),
                     re.compile(r"(?i)\b(d)('ye)\b"),
                     re.compile(r"(?i)\b(gim)(me)\b"),
                     re.compile(r"(?i)\b(gon)(na)\b"),
                     re.compile(r"(?i)\b(got)(ta)\b"),
                     re.compile(r"(?i)\b(lem)(me)\b"),
                     re.compile(r"(?i)\b(mor)('n)\b"),
                     re.compile(r"(?i)\b(wan)(na) ")]
    # Leading-apostrophe contractions: "'tis" -> "'t is", "'twas" -> "'t was".
    # The leading space in the pattern is safe because tokenize() pads the
    # text with spaces before these are applied.
    CONTRACTIONS3 = [re.compile(r"(?i) ('t)(is)\b"),
                     re.compile(r"(?i) ('t)(was)\b")]
    # Three-part contractions; kept for parity with the sed script but not
    # applied (see note in tokenize()).
    CONTRACTIONS4 = [re.compile(r"(?i)\b(whad)(dd)(ya)\b"),
                     re.compile(r"(?i)\b(wha)(t)(cha)\b")]

    def tokenize(self, text):
        """
        Tokenize ``text`` (a single sentence) and return a list of token
        strings, applying the Penn Treebank conventions described in the
        class docstring.
        """
        # starting quotes: a double quote at the start of the text or after
        # an opening bracket/space becomes Treebank-style backquotes.
        text = re.sub(r'^\"', r'``', text)
        text = re.sub(r'(``)', r' \1 ', text)
        text = re.sub(r'([ (\[{<])"', r'\1 `` ', text)

        # punctuation: split ':' and ',' only when NOT followed by a digit,
        # so numbers like "50,000" and times like "3:00" stay intact.
        text = re.sub(r'([:,])([^\d])', r' \1 \2', text)
        text = re.sub(r'\.\.\.', r' ... ', text)
        text = re.sub(r'[;@#$%&]', r' \g<0> ', text)
        # split a sentence-final period, even when it is followed by closing
        # brackets/quotes (e.g. 'word.")' -> 'word ."' + ')').
        text = re.sub(r'([^\.])(\.)([\]\)}>"\']*)\s*$', r'\1 \2\3 ', text)
        text = re.sub(r'[?!]', r' \g<0> ', text)
        text = re.sub(r"([^'])' ", r"\1 ' ", text)

        # parens, brackets, etc.
        text = re.sub(r'[\]\[\(\)\{\}\<\>]', r' \g<0> ', text)
        text = re.sub(r'--', r' -- ', text)

        # add extra space to make things easier
        text = " " + text + " "

        # ending quotes
        text = re.sub(r'"', " '' ", text)
        text = re.sub(r'(\S)(\'\')', r'\1 \2 ', text)
        # Split off clitics ('s, 'm, 'd, a bare possessive ', 'll, 're, 've,
        # n't — and their upper-case forms) before a space.
        # BUGFIX: the 'll/'re/'ve/n't alternations originally ended with an
        # empty branch ("...|n't|) "), which matched the empty string after
        # every non-quote character followed by a space and performed a
        # useless substitution there; the stray "|" is removed.  (Token
        # output is unchanged because runs of spaces are collapsed below.)
        text = re.sub(r"([^' ])('[sS]|'[mM]|'[dD]|') ", r"\1 \2 ", text)
        text = re.sub(r"([^' ])('ll|'re|'ve|n't) ", r"\1 \2 ", text)
        text = re.sub(r"([^' ])('LL|'RE|'VE|N'T) ", r"\1 \2 ", text)

        for regexp in self.CONTRACTIONS2:
            text = regexp.sub(r' \1 \2 ', text)
        for regexp in self.CONTRACTIONS3:
            text = regexp.sub(r' \1 \2 ', text)

        # We are not using CONTRACTIONS4 since
        # they are also commented out in the SED scripts
        # for regexp in self.CONTRACTIONS4:
        #     text = regexp.sub(r' \1 \2 \3 ', text)

        # collapse the runs of spaces introduced by the substitutions above
        text = re.sub(" +", " ", text)
        text = text.strip()

        # add space at end to match up with MacIntyre's output (for debugging)
        if text != "":
            text += " "

        return text.split()
if __name__ == "__main__":
    # Self-test entry point: run the doctest examples embedded in this
    # module, treating differing runs of whitespace in expected output
    # as equivalent.
    import doctest
    doctest.testmod(optionflags=doctest.NORMALIZE_WHITESPACE)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment