Skip to content

Instantly share code, notes, and snippets.

@napsternxg
Created August 11, 2015 21:23
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save napsternxg/49e186a4cec4fd1682fe to your computer and use it in GitHub Desktop.
Save napsternxg/49e186a4cec4fd1682fe to your computer and use it in GitHub Desktop.
import nltk
import string
from collections import Counter
def untokenize(ngram):
    """Join the tokens of an n-gram back into a single readable string.

    A token is glued directly onto the text before it when it starts with
    an apostrophe (e.g. "'s"), is a substring of ``string.punctuation``
    (which covers every single punctuation character), or is the
    contraction suffix "n't". Every other token is preceded by one space.
    Leading and trailing whitespace is stripped from the final result.

    Args:
        ngram: Iterable of string tokens.

    Returns:
        The tokens joined into one string.
    """
    pieces = []
    for token in ngram:
        attaches_to_previous = (
            token.startswith("'")
            or token in string.punctuation
            or token == "n't"
        )
        pieces.append(token if attaches_to_previous else " " + token)
    return "".join(pieces).strip()
def _is_punctuation(token):
    """Return True when every character of *token* is in string.punctuation."""
    return all(char in string.punctuation for char in token)


def extract_phrases(text, phrase_counter, length):
    """Count the punctuation-free n-grams of *text* in *phrase_counter*.

    The text is split into sentences and each sentence into word tokens
    with NLTK; n-grams are built per sentence, so no phrase spans a
    sentence boundary. Any n-gram containing a punctuation token is
    skipped; the rest are turned back into strings with ``untokenize``
    and their counts incremented.

    Args:
        text: Raw text to extract phrases from.
        phrase_counter: Mapping updated in place via ``+= 1``; it must
            supply a zero default for missing keys (e.g. a
            ``collections.Counter``).
        length: Number of tokens per phrase (the n in n-gram).

    Returns:
        None. Results are accumulated in *phrase_counter*.
    """
    for sentence in nltk.sent_tokenize(text):
        tokens = nltk.word_tokenize(sentence)
        for phrase in nltk.util.ngrams(tokens, length):
            # A plain `token in string.punctuation` is a substring test, so
            # multi-character punctuation tokens produced by the tokenizer
            # (such as "..." or the quote markers "``" and "''") would slip
            # through. Checking character by character catches them too.
            if any(_is_punctuation(token) for token in phrase):
                continue
            phrase_counter[untokenize(phrase)] += 1
def main():
    """Count bigrams over a few sample sentences and print the top five."""
    phrase_counter = Counter()
    sentences = ["This is good", "This is awesome", "Weather is good"]
    for sentence in sentences:
        extract_phrases(sentence, phrase_counter, 2)
    # The print statement is a SyntaxError on Python 3; calling print() with
    # a single argument produces the same output on Python 2 and Python 3.
    print(phrase_counter.most_common(5))


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment