sandsfish/ngram-analysis.py

## ngram-analysis.py
# all_summary is the data here. In this case, just a lot of text records collapsed into one corpus string.

# Prep data for NLTK analysis.
# NOTE: `all_summary` is not defined in the visible part of this file; it must
# already be bound to the corpus string before this point.
import nltk.collocations
tokens = nltk.word_tokenize(all_summary)
text = nltk.Text(tokens)

# Remove stop-words, convert to lower-case, drop every token that is not purely alphabetic
from nltk.corpus import stopwords
# NOTE: this rebinds the imported `stopwords` name to the list of English
# stop-words; the name is kept as-is because later code may rely on it.
stopwords = stopwords.words('english')
# Set copy so the per-token membership test below is O(1) instead of a list scan.
_stopword_set = frozenset(stopwords)

# Alphabetic tokens only, lower-cased.
text1 = nltk.Text([w.lower() for w in text if w.isalpha()])
# Plain list of the remaining tokens with stop-words removed.
text2 = [w for w in text1 if w not in _stopword_set]

# 200 Best 'Bigram Collocations' by Score - Write to File

from nltk.collocations import BigramCollocationFinder
from nltk.metrics import BigramAssocMeasures
from nltk import collocations

def best_bigram_word_feats(words, score_fn=BigramAssocMeasures.chi_sq, n=200, freq=5):
    """Rank the bigram collocations of ``words`` and save the best ones to a file.

    A ``BigramCollocationFinder`` is built from ``words``, filtered with
    ``apply_freq_filter(freq)``, and ranked with ``score_fn``. The ``n`` best
    bigrams are written, one "word1 word2" pair per line, to a text file in
    the current working directory and are also returned.

    Args:
        words: Sequence of tokens to search for collocations.
        score_fn: Association measure used for ranking; defaults to
            ``BigramAssocMeasures.chi_sq``.
        n: Maximum number of bigrams to keep.
        freq: Threshold handed to ``apply_freq_filter``.

    Returns:
        The list of bigram tuples produced by ``nbest``.
    """
    bigram_finder = BigramCollocationFinder.from_words(words)
    bigram_finder.apply_freq_filter(freq)
    bigrams = bigram_finder.nbest(score_fn, n)

    # NOTE: the file name always ends in "_pmi", whatever score_fn is; it is
    # left unchanged so existing output paths stay the same.
    out_path = 'best_{0}_bigram_collocations_more_than_{1}_occurrences_pmi.txt'.format(n, freq)

    # Text mode instead of the former 'wb': writing str to a binary file raises
    # TypeError on Python 3. newline='\n' keeps the exact line endings binary
    # mode produced, on every platform.
    with open(out_path, 'w', encoding='utf-8', newline='\n') as f:
        f.writelines("{0} {1}\n".format(first, second) for first, second in bigrams)

    return bigrams

# Run the collocation search on the filtered tokens, ranking by PMI
# (n=200 results, frequency-filter threshold of 20).
best_bigrams = best_bigram_word_feats(
    text2,
    score_fn=BigramAssocMeasures.pmi,
    n=200,
    freq=20,
)
	# all_summary is the data here. In this case, just a lot of text records collapsed into one corpus string.

	# Prep data for NLTK analysis.
	# NOTE(review): this tab-indented section repeats the un-indented script
	# earlier in the file; as module-level code the leading tab is an
	# IndentationError. Presumably a paste artifact -- confirm and remove.
	import nltk.collocations
	tokens = nltk.word_tokenize(all_summary)
	text = nltk.Text(tokens)

	# Remove stop-words, convert to lower-case, drop every token that is not purely alphabetic
	from nltk.corpus import stopwords
	# NOTE: this rebinds the imported `stopwords` name to the list of English
	# stop-words; the name is kept as-is because later code may rely on it.
	stopwords = stopwords.words('english')
	# Set copy so the per-token membership test below is O(1) instead of a list scan.
	_stopword_set = frozenset(stopwords)

	# Alphabetic tokens only, lower-cased.
	text1 = nltk.Text([w.lower() for w in text if w.isalpha()])
	# Plain list of the remaining tokens with stop-words removed.
	text2 = [w for w in text1 if w not in _stopword_set]

	# 200 Best 'Bigram Collocations' by Score - Write to File

	from nltk.collocations import BigramCollocationFinder
	from nltk.metrics import BigramAssocMeasures
	from nltk import collocations

	def best_bigram_word_feats(words, score_fn=BigramAssocMeasures.chi_sq, n=200, freq=5):
		"""Rank the bigram collocations of ``words`` and save the best ones to a file.

		A ``BigramCollocationFinder`` is built from ``words``, filtered with
		``apply_freq_filter(freq)``, and ranked with ``score_fn``. The ``n`` best
		bigrams are written, one "word1 word2" pair per line, to a text file in
		the current working directory and are also returned.

		Args:
			words: Sequence of tokens to search for collocations.
			score_fn: Association measure used for ranking; defaults to
				``BigramAssocMeasures.chi_sq``.
			n: Maximum number of bigrams to keep.
			freq: Threshold handed to ``apply_freq_filter``.

		Returns:
			The list of bigram tuples produced by ``nbest``.
		"""
		# The function body is nested under the def again; in the flattened
		# original every statement sat at the def's own indent level.
		bigram_finder = BigramCollocationFinder.from_words(words)
		bigram_finder.apply_freq_filter(freq)
		bigrams = bigram_finder.nbest(score_fn, n)

		# NOTE: the file name always ends in "_pmi", whatever score_fn is; it is
		# left unchanged so existing output paths stay the same.
		out_path = 'best_{0}_bigram_collocations_more_than_{1}_occurrences_pmi.txt'.format(n, freq)

		# Text mode instead of the former 'wb': writing str to a binary file
		# raises TypeError on Python 3. newline='\n' keeps the exact line endings
		# binary mode produced, on every platform.
		with open(out_path, 'w', encoding='utf-8', newline='\n') as f:
			f.writelines("{0} {1}\n".format(first, second) for first, second in bigrams)

		return bigrams

	# Run the collocation search on the filtered tokens, ranking by PMI
	# (n=200 results, frequency-filter threshold of 20).
	best_bigrams = best_bigram_word_feats(
		text2,
		score_fn=BigramAssocMeasures.pmi,
		n=200,
		freq=20,
	)