Skip to content

Instantly share code, notes, and snippets.

@ckoppelman
Created April 28, 2017 17:42
Show Gist options
  • Star 4 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save ckoppelman/c93e4192d9f189fba590e095258f8f33 to your computer and use it in GitHub Desktop.
Save ckoppelman/c93e4192d9f189fba590e095258f8f33 to your computer and use it in GitHub Desktop.
import nltk
from nltk.tokenize.treebank import TreebankWordTokenizer
class TreebankSpanTokenizer(TreebankWordTokenizer):
    """Treebank word tokenizer that can also report character offsets.

    Tokenization itself is delegated to a wrapped ``TreebankWordTokenizer``;
    ``span_tokenize`` then maps every produced token back onto the input
    string and yields its ``(start, end)`` offsets.
    """

    def __init__(self):
        super(TreebankSpanTokenizer, self).__init__()
        self._word_tokenizer = TreebankWordTokenizer()

    def span_tokenize(self, text):
        """Yield a ``(start, end)`` offset pair for each token of *text*.

        Offsets index into *text* itself, so ``text[start:end]`` is the
        original surface form of the token.  Tokens are located left to
        right, each search starting where the previous token ended.

        Raises:
            ValueError: if a token cannot be located in the remaining text.
        """
        cursor = 0
        for token in self.tokenize(text):
            start = text.find(token, cursor)
            width = len(token)
            if token in ("``", "''"):
                # The Treebank tokenizer rewrites a double quote (") as ``
                # or '', so such a token may correspond to a single '"'
                # character in the original text.  Use whichever candidate
                # comes first after the cursor.
                quote_at = text.find('"', cursor)
                if quote_at != -1 and (start == -1 or quote_at < start):
                    start = quote_at
                    width = 1
            if start == -1:
                # Never emit a negative offset: it would silently corrupt
                # this span and every span that follows it.
                raise ValueError(
                    "could not align token {!r} with the text after "
                    "offset {}".format(token, cursor)
                )
            end = start + width
            yield (start, end)
            cursor = end

    def tokenize(self, text):
        """Return the tokens the wrapped Treebank tokenizer yields for *text*."""
        return self._word_tokenizer.tokenize(text)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment