Skip to content

Instantly share code, notes, and snippets.

@sandsfish
Created October 16, 2015 18:50
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save sandsfish/de58318b0bae5d06c056 to your computer and use it in GitHub Desktop.
Save sandsfish/de58318b0bae5d06c056 to your computer and use it in GitHub Desktop.
# all_summary is the data here: in this case, a lot of text records collapsed into one corpus string.
# Prep data for NLTK analysis: tokenize the corpus and wrap the tokens in an nltk.Text.
import nltk.collocations
tokens = nltk.word_tokenize(all_summary)
text = nltk.Text(tokens)
# Keep only purely alphabetic tokens (lower-cased), then drop English stop-words.
# The corpus reader is imported under an alias so that binding the word list to
# `stopwords` does not rebind an imported name to a different kind of object.
from nltk.corpus import stopwords as stopwords_corpus
stopwords = stopwords_corpus.words('english')
# Membership tests against a set are O(1); testing every token against the
# list would rescan the whole stop-word list each time.
stopword_set = frozenset(stopwords)
text1 = nltk.Text([w.lower() for w in text if w.isalpha()])
text2 = [w for w in text1 if w not in stopword_set]
# 200 Best 'Bigram Collocations' by Score - Write to File
from nltk.collocations import BigramCollocationFinder
from nltk.metrics import BigramAssocMeasures
from nltk import collocations
def best_bigram_word_feats(words, score_fn=BigramAssocMeasures.chi_sq, n=200, freq=5):
    """Find the top-scoring bigram collocations in *words* and write them to a file.

    Args:
        words: Sequence of tokens to search for bigrams.
        score_fn: Association measure passed to ``nbest`` to rank the bigrams.
        n: Maximum number of bigrams to keep.
        freq: Threshold handed to ``apply_freq_filter``; bigrams occurring
            fewer times than this are discarded before scoring.

    Returns:
        The list of bigrams returned by ``nbest``, best first.

    Side effects:
        Writes one bigram per line ("word1 word2") to a text file in the
        current working directory, replacing any existing file of that name.
    """
    bigram_finder = BigramCollocationFinder.from_words(words)
    bigram_finder.apply_freq_filter(freq)
    bigrams = bigram_finder.nbest(score_fn, n)
    # NOTE(review): the file name always ends in "_pmi", whatever score_fn was
    # used; kept unchanged so existing consumers still find the file.
    out_path = 'best_{0}_bigram_collocations_more_than_{1}_occurrences_pmi.txt'.format(n, freq)
    # Text mode with an explicit encoding: writing str to a file opened in 'wb'
    # raises TypeError on Python 3. newline='\n' keeps the line endings the
    # same on every platform, as the binary-mode write intended.
    with open(out_path, 'w', encoding='utf-8', newline='\n') as f:
        f.writelines("{0} {1}\n".format(first, second) for first, second in bigrams)
    return bigrams
# Rank the filtered tokens' bigrams by PMI: keep at most 200, using a
# frequency threshold of 20, and write them to the output file.
best_bigrams = best_bigram_word_feats(
    text2,
    score_fn=BigramAssocMeasures.pmi,
    n=200,
    freq=20,
)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment