@bdewilde
Last active December 16, 2015 03:18
Tokenize a document into sentences of normalized, filtered words
def tokenize_and_normalize_doc(doc, filter_stopwords=True, normalize='lemma'):
    import nltk.corpus
    from nltk.stem import PorterStemmer, WordNetLemmatizer
    from nltk.tokenize import sent_tokenize, wordpunct_tokenize
    from string import punctuation
    # use NLTK's default set of English stop words
    stops_list = nltk.corpus.stopwords.words('english')
    if normalize == 'lemma':
        # lemmatize with WordNet
        normalizer = WordNetLemmatizer()
    elif normalize == 'stem':
        # stem with Porter
        normalizer = PorterStemmer()
    # tokenize the document into sentences with NLTK's default (Punkt) tokenizer
    sents = sent_tokenize(doc)
    # tokenize each sentence into words, splitting on whitespace and punctuation
    tokenized_sents = [wordpunct_tokenize(sent) for sent in sents]
    # filter out "bad" words, normalize the good ones
    normalized_sents = []
    for tokenized_sent in tokenized_sents:
        good_words = [word for word in tokenized_sent
                      # filter out too-long words
                      if len(word) < 25
                      # filter out bare punctuation
                      if word not in list(punctuation)]
        if filter_stopwords:
            good_words = [word for word in good_words
                          # filter out stop words
                          if word not in stops_list]
        if normalize == 'lemma':
            normalized_sents.append([normalizer.lemmatize(word) for word in good_words])
        elif normalize == 'stem':
            normalized_sents.append([normalizer.stem(word) for word in good_words])
        else:
            normalized_sents.append(good_words)
    return normalized_sents
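
A minimal usage sketch, assuming the NLTK data the function relies on (the Punkt sentence tokenizer model, the stop-word list, and WordNet) has been downloaded; the sample document string is made up purely for illustration.

# one-time setup: sentence tokenizer model, stop-word list, and WordNet data
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

# hypothetical sample document, purely for illustration
doc = "The cats were sitting on the mats. They had been sleeping all afternoon!"

sents = tokenize_and_normalize_doc(doc, filter_stopwords=True, normalize='lemma')
print(sents)
# roughly: [['The', 'cat', 'sitting', 'mat'], ['They', 'sleeping', 'afternoon']]

Note that capitalized tokens like "The" and "They" survive the stop-word filter because NLTK's stop list is lowercase; lowercasing tokens before filtering would remove them as well.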