Skip to content

Instantly share code, notes, and snippets.

@neuman
Created September 14, 2015 18:00
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save neuman/cac2014d76dd1343c6be to your computer and use it in GitHub Desktop.
Save neuman/cac2014d76dd1343c6be to your computer and use it in GitHub Desktop.
Find Keywords Across Documents
from __future__ import division, unicode_literals
import math
from textblob import TextBlob as tb
from goose import Goose
def tf(word, blob):
    """Return the term frequency of ``word`` within ``blob``.

    The frequency is the number of times ``word`` occurs in ``blob.words``
    divided by the total number of entries in ``blob.words``.

    Args:
        word: The token to count.
        blob: An object exposing a ``words`` sequence that supports
            ``count()`` and ``len()``.

    Returns:
        The fraction of entries in ``blob.words`` equal to ``word``, as a
        float. Returns ``0.0`` when ``blob.words`` is empty rather than
        raising ``ZeroDivisionError``.
    """
    words = blob.words
    total = len(words)
    if not total:
        # An empty document contains no occurrences of any word.
        return 0.0
    # float() keeps this a true division even without the __future__ import.
    return words.count(word) / float(total)
def n_containing(word, bloblist):
    """Return how many documents in ``bloblist`` contain ``word`` as a token.

    Membership is tested against each document's ``words`` sequence, the
    same token list that ``tf`` counts over. Testing ``word in blob``
    directly on a TextBlob is a substring search of the raw text, which
    would count "the" as present in a document that only contains "there".

    Args:
        word: The token to look for.
        bloblist: Iterable of objects exposing a ``words`` sequence.

    Returns:
        The number of documents whose ``words`` include ``word``.
    """
    return sum(1 for blob in bloblist if word in blob.words)
def idf(word, bloblist):
    """Return the inverse document frequency of ``word`` over ``bloblist``.

    Computed as the natural log of the number of documents divided by one
    plus the number of documents reported by ``n_containing``.

    Args:
        word: The token being scored.
        bloblist: The collection of documents; its ``len()`` is the corpus
            size.

    Returns:
        ``math.log(len(bloblist) / (1 + n_containing(word, bloblist)))``.
    """
    total_docs = len(bloblist)
    # The +1 keeps the denominator non-zero when no document has the word.
    smoothed_doc_count = 1 + n_containing(word, bloblist)
    return math.log(total_docs / smoothed_doc_count)
def tfidf(word, blob, bloblist):
    """Return the TF-IDF score of ``word`` for ``blob`` within ``bloblist``.

    Args:
        word: The token being scored.
        blob: The document the term frequency is measured in.
        bloblist: The collection of documents used for the inverse document
            frequency.

    Returns:
        The product of ``tf(word, blob)`` and ``idf(word, bloblist)``.
    """
    term_frequency = tf(word, blob)
    inverse_document_frequency = idf(word, bloblist)
    return term_frequency * inverse_document_frequency
# Sample document: a TextBlob built from a short passage about the film
# "Python". It is not referenced by any of the functions visible in this file.
document1 = tb("""Python is a 2000 made-for-TV horror movie directed by Richard
Clabaugh. The film features several cult favorite actors, including William
Zabka of The Karate Kid fame, Wil Wheaton, Casper Van Dien, Jenny McCarthy,
Keith Coogan, Robert Englund (best known for his role as Freddy Krueger in the
A Nightmare on Elm Street series of films), Dana Barron, David Bowe, and Sean
Whalen. The film concerns a genetically engineered snake, a python, that
escapes and unleashes itself on a small town. It includes the classic final
girl scenario evident in films like Friday the 13th. It was filmed in Los Angeles,
California and Malibu, California. Python was followed by two sequels: Python
II (2002) and Boa vs. Python (2004), both also made-for-TV films.""")
def find_keywords_across_articles(articles, top_n=6):
    """Print the highest-scoring TF-IDF words for each article.

    Each article's ``url`` is passed to a Goose extractor and the resulting
    ``cleaned_text`` is wrapped in a TextBlob. Articles that fail at any
    point of that step are reported and skipped. Every word of each
    remaining document is then scored with ``tfidf`` against the full set of
    extracted documents, and the best-scoring words are printed.

    Args:
        articles: Iterable of objects exposing a ``url`` attribute.
        top_n: How many top-scoring words to print per document
            (defaults to 6).

    Returns:
        None. Results are written to standard output.
    """
    g = Goose()
    bloblist = []
    for article in articles:
        try:
            blob = tb(g.extract(url=article.url).cleaned_text)
        except Exception as exc:
            # Best-effort: one bad article must not abort the whole run, but
            # say what went wrong instead of hiding it. A single-argument
            # print(...) behaves the same on Python 2 and Python 3.
            print("problem! {!r}".format(exc))
        else:
            bloblist.append(blob)
    # Document numbers below index the successfully extracted articles only.
    for i, blob in enumerate(bloblist):
        print("Top words in document {}".format(i + 1))
        scores = {}
        for word in blob.words:
            # Score each distinct word once; repeats would yield the same value.
            if word not in scores:
                scores[word] = tfidf(word, blob, bloblist)
        sorted_words = sorted(scores.items(), key=lambda x: x[1], reverse=True)
        for word, score in sorted_words[:top_n]:
            print("\tWord: {}, TF-IDF: {}".format(word, round(score, 5)))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment