# NOTE: This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
# Learn more about bidirectional Unicode characters.
# `all_summary` is the input data: in this case, a lot of text records
# collapsed into one corpus string. It must be defined before this point.

# Prep data for NLTK analysis.
import nltk.collocations  # also binds the top-level `nltk` name used below

# Tokenize the corpus string and wrap the tokens in an nltk.Text object.
tokens = nltk.word_tokenize(all_summary)
text = nltk.Text(tokens)

# Load the English stop-word list for the cleanup steps (remove stop-words,
# convert to lower-case, remove all non-alpha characters).
# The corpus reader is imported under an alias so the module-level name
# `stopwords` is bound exactly once, to the word list, instead of first to
# the reader and then being overwritten.
from nltk.corpus import stopwords as stopwords_corpus
stopwords = stopwords_corpus.words('english')
# Older | Newer