# davidfauth/pythonBiGram.py

## pythonBiGram.py
@outputSchema("top_five:bag{t:(bigram:chararray)}")
def top5_bigrams(textDescription):
    """Return the five strongest bigram collocations found in a text.

    The text is split into sentences, each sentence is split into word
    tokens, and the bigrams found in those token lists are ranked with
    NLTK's likelihood-ratio association measure.

    Args:
        textDescription: Text to analyse. ``None`` or an empty string
            yields an empty result.

    Returns:
        A list of at most five one-element tuples, each holding one bigram
        rendered as ``"first second"``. This is the bag-of-single-field
        tuples shape named in the ``outputSchema`` decorator string.

    Note:
        ``nltk`` and ``outputSchema`` are not defined in this block. They
        are presumably provided by a module-level import and by the Pig
        Jython UDF runtime respectively -- confirm before reusing this
        function outside that environment.
    """
    # Guard against a missing value (e.g. a null input field): the sentence
    # tokenizer expects a string, so short-circuit with an empty result.
    if not textDescription:
        return []

    sentences = nltk.tokenize.sent_tokenize(textDescription)
    # One token list per sentence; the finder treats each list as a document.
    tokens = [nltk.tokenize.word_tokenize(s) for s in sentences]

    bgm = nltk.collocations.BigramAssocMeasures()
    finder = nltk.collocations.BigramCollocationFinder.from_documents(tokens)
    top_5 = finder.nbest(bgm.likelihood_ratio, 5)

    # Wrap each joined bigram in a 1-tuple so every list element is a tuple
    # with a single chararray field, as declared in the output schema.
    return [("%s %s" % (first, second),) for first, second in top_5]