tokenize document into sentences of normalized+filtered words
from string import punctuation

import nltk.corpus
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.tokenize import sent_tokenize, wordpunct_tokenize


def tokenize_and_normalize_doc(doc, filter_stopwords=True, normalize='lemma'):
    """Tokenize a document into sentences of normalized, filtered words.

    normalize: 'lemma' to lemmatize with WordNet, 'stem' to stem with Porter,
    anything else to leave words unnormalized.
    """
    # use NLTK's default set of English stop words
    stop_words = set(nltk.corpus.stopwords.words('english'))
    punct = set(punctuation)
    if normalize == 'lemma':
        # lemmatize with WordNet
        normalizer = WordNetLemmatizer()
    elif normalize == 'stem':
        # stem with the Porter stemmer
        normalizer = PorterStemmer()
    # tokenize the document into sentences with NLTK's default sentence tokenizer
    sents = sent_tokenize(doc)
    # tokenize each sentence into words on whitespace and punctuation
    tokenized_sents = [wordpunct_tokenize(sent) for sent in sents]
    # filter out "bad" words, normalize the good ones
    normalized_sents = []
    for tokenized_sent in tokenized_sents:
        # keep words that aren't too long and aren't bare punctuation
        good_words = [word for word in tokenized_sent
                      if len(word) < 25 and word not in punct]
        if filter_stopwords:
            # filter out stop words
            good_words = [word for word in good_words if word not in stop_words]
        if normalize == 'lemma':
            normalized_sents.append([normalizer.lemmatize(word) for word in good_words])
        elif normalize == 'stem':
            normalized_sents.append([normalizer.stem(word) for word in good_words])
        else:
            normalized_sents.append(good_words)
    return normalized_sents
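A minimal usage sketch, assuming the NLTK data the function depends on ('punkt' for the sentence tokenizer, 'stopwords', and 'wordnet' for the lemmatizer) has been downloaded; the sample document and expected output are illustrative only:

import nltk

# one-time downloads of the required NLTK data
nltk.download('punkt')      # sentence tokenizer models
nltk.download('stopwords')  # English stop word list
nltk.download('wordnet')    # lexical database backing WordNetLemmatizer

doc = "The cats were chasing mice. They ran across the gardens quickly!"
print(tokenize_and_normalize_doc(doc, filter_stopwords=True, normalize='lemma'))
# roughly: [['The', 'cat', 'chasing', 'mouse'], ['They', 'ran', 'garden', 'quickly']]

Note that the stop word list is lowercase and the function does not lowercase tokens, so capitalized sentence-initial words like 'The' pass through the stop word filter; lowercasing tokens before filtering would be a natural extension.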