# napsternxg/phrases.py

## phrases.py
import nltk
import string
from collections import Counter

def untokenize(ngram):
    """Join the tokens of *ngram* back into a single string.

    Tokens are separated by one space, except that a token is glued
    directly onto the preceding text when it starts with an apostrophe
    (e.g. ``'s``), is found in ``string.punctuation``, or is the
    contraction ``n't``.  Leading and trailing whitespace is stripped
    from the result.
    """
    pieces = []
    for token in ngram:
        # NOTE: ``in string.punctuation`` is a substring test on the
        # punctuation string, not a per-character set lookup.
        glue_to_previous = (
            token.startswith("'")
            or token in string.punctuation
            or token == "n't"
        )
        pieces.append(token if glue_to_previous else " " + token)
    return "".join(pieces).strip()


def extract_phrases(text, phrase_counter, length):
    """Count every punctuation-free n-gram of *length* tokens in *text*.

    *text* is split into sentences with ``nltk.sent_tokenize`` and each
    sentence into tokens with ``nltk.word_tokenize``; n-grams are built
    per sentence, so a phrase never spans a sentence boundary.

    Args:
        text: The raw text to scan.
        phrase_counter: A mapping that supports ``counter[key] += 1`` for
            unseen keys (e.g. ``collections.Counter``).  It is updated in
            place, keyed by the untokenized phrase string.
        length: Number of tokens per phrase (the "n" of the n-gram).

    Returns:
        None.  Results are accumulated in *phrase_counter*.
    """
    punctuation_chars = frozenset(string.punctuation)

    def is_punctuation(token):
        # A plain ``token in string.punctuation`` is a substring test and
        # misses multi-character punctuation tokens that the tokenizer can
        # emit (such as ``...``, ``--``, `` and ''), so test each character.
        return all(char in punctuation_chars for char in token)

    for sent in nltk.sent_tokenize(text):
        words = nltk.word_tokenize(sent)
        for phrase in nltk.util.ngrams(words, length):
            if not any(is_punctuation(word) for word in phrase):
                phrase_counter[untokenize(phrase)] += 1


if __name__ == "__main__":
    # Demo: count bigrams over a few sample sentences and show the five
    # most frequent ones.
    phrase_counter = Counter()
    sentences = ["This is good", "This is awesome", "Weather is good"]
    for sentence in sentences:
        extract_phrases(sentence, phrase_counter, 2)
    # A parenthesised single-argument print produces the same output on
    # Python 2 and is valid syntax on Python 3.
    print(phrase_counter.most_common(5))
	import nltk
	import string
	from collections import Counter

	def untokenize(ngram):
		"""Join the tokens of *ngram* back into a single string.

		Tokens are separated by one space, except that a token is glued
		directly onto the preceding text when it starts with an apostrophe
		(e.g. ``'s``), is found in ``string.punctuation``, or is the
		contraction ``n't``.  Leading and trailing whitespace is stripped
		from the result.
		"""
		pieces = []
		for token in ngram:
			# NOTE: ``in string.punctuation`` is a substring test on the
			# punctuation string, not a per-character set lookup.
			glue_to_previous = (
				token.startswith("'")
				or token in string.punctuation
				or token == "n't"
			)
			pieces.append(token if glue_to_previous else " " + token)
		return "".join(pieces).strip()


	def extract_phrases(text, phrase_counter, length):
		"""Count every punctuation-free n-gram of *length* tokens in *text*.

		*text* is split into sentences with ``nltk.sent_tokenize`` and each
		sentence into tokens with ``nltk.word_tokenize``; n-grams are built
		per sentence, so a phrase never spans a sentence boundary.

		Args:
			text: The raw text to scan.
			phrase_counter: A mapping that supports ``counter[key] += 1``
				for unseen keys (e.g. ``collections.Counter``).  It is
				updated in place, keyed by the untokenized phrase string.
			length: Number of tokens per phrase (the "n" of the n-gram).

		Returns:
			None.  Results are accumulated in *phrase_counter*.
		"""
		punctuation_chars = frozenset(string.punctuation)

		def is_punctuation(token):
			# A plain ``token in string.punctuation`` is a substring test
			# and misses multi-character punctuation tokens that the
			# tokenizer can emit (such as ``...``, ``--``, `` and ''), so
			# test each character.
			return all(char in punctuation_chars for char in token)

		for sent in nltk.sent_tokenize(text):
			words = nltk.word_tokenize(sent)
			for phrase in nltk.util.ngrams(words, length):
				if not any(is_punctuation(word) for word in phrase):
					phrase_counter[untokenize(phrase)] += 1


	if __name__ == "__main__":
		# Demo: count bigrams over a few sample sentences and show the five
		# most frequent ones.
		phrase_counter = Counter()
		sentences = ["This is good", "This is awesome", "Weather is good"]
		for sentence in sentences:
			extract_phrases(sentence, phrase_counter, 2)
		# A parenthesised single-argument print produces the same output on
		# Python 2 and is valid syntax on Python 3.
		print(phrase_counter.most_common(5))