Skip to content

Instantly share code, notes, and snippets.

# all_summary is the input data: in this case, many text records collapsed into one corpus string.
# Prep data for NLTK analysis.
# `all_summary` must already be defined (a single text string) before this
# runs; it is split into word tokens and wrapped in an nltk.Text object.
import nltk.collocations
# The corpus reader is imported under an alias so that assigning the
# `stopwords` list below does not overwrite the reader it is built from
# (re-running that assignment would otherwise fail on a plain list).
from nltk.corpus import stopwords as stopwords_corpus

tokens = nltk.word_tokenize(all_summary)
text = nltk.Text(tokens)

# Remove stop-words, convert to lower-case, remove all non-alpha characters
# `stopwords` holds the English stop-word list; presumably it is consumed by
# the filtering step that follows this section -- verify against later code.
stopwords = stopwords_corpus.words('english')