Skip to content

Instantly share code, notes, and snippets.

@neuman
Created May 17, 2016 00:22
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save neuman/366e24c3aa8eb93514e20201b1c1589e to your computer and use it in GitHub Desktop.
Save neuman/366e24c3aa8eb93514e20201b1c1589e to your computer and use it in GitHub Desktop.
from __future__ import division, unicode_literals
import math
from textblob import TextBlob as tb
from goose import Goose
def tf(word, blob):
    """Return the term frequency of ``word`` within ``blob``.

    The frequency is the number of times ``word`` occurs in ``blob.words``
    divided by the total number of words in ``blob.words``.

    Args:
        word: The term to count.
        blob: An object exposing a ``words`` sequence (e.g. a TextBlob).

    Returns:
        The relative frequency as a float, or ``0.0`` when the document
        contains no words at all.
    """
    words = blob.words
    total = len(words)
    if not total:
        # An empty document (e.g. extraction yielded no text) would
        # otherwise raise ZeroDivisionError.
        return 0.0
    return words.count(word) / total
def n_containing(word, bloblist):
    """Return how many documents in ``bloblist`` contain ``word``.

    Membership is tested against each document's ``words`` sequence, so
    only whole-token matches count. Testing ``word in blob`` directly would
    be a substring search on the raw text for a TextBlob (``"cat"`` would
    match a document that only contains ``"concatenate"``), which inflates
    the document count and is inconsistent with the token-based term
    frequency.

    Args:
        word: The term to look for.
        bloblist: An iterable of objects exposing a ``words`` sequence.

    Returns:
        The number of documents whose ``words`` include ``word``.
    """
    return sum(1 for blob in bloblist if word in blob.words)
def idf(word, bloblist):
    """Return the inverse document frequency of ``word`` over ``bloblist``.

    Computed as ``log(N / (1 + n))`` where ``N`` is the number of documents
    and ``n`` is the number of documents containing ``word``. The ``1 +``
    keeps the denominator non-zero for unseen words; as a consequence a
    word present in every document gets a slightly negative value.

    Args:
        word: The term to weigh.
        bloblist: A sized collection of documents.

    Returns:
        The IDF as a float, or ``0.0`` when ``bloblist`` is empty.
    """
    total_docs = len(bloblist)
    if not total_docs:
        # log(0) would raise "ValueError: math domain error"; with no
        # documents a term carries no weight.
        return 0.0
    return math.log(total_docs / (1 + n_containing(word, bloblist)))
def tfidf(word, blob, bloblist):
    """Return the TF-IDF score of ``word`` for ``blob`` within ``bloblist``.

    The score is the product of the word's term frequency in ``blob`` and
    its inverse document frequency across ``bloblist``.
    """
    term_frequency = tf(word, blob)
    inverse_document_frequency = idf(word, bloblist)
    return term_frequency * inverse_document_frequency
def find_keywords_across_articles(articles, top_n=6):
    """Print the highest-scoring TF-IDF words for each article.

    Each article's URL is fetched with Goose and its ``cleaned_text`` is
    wrapped in a TextBlob. Articles that fail to download or parse are
    reported and skipped, so the printed document numbers refer to the
    position among the successfully extracted articles, not to the
    position in ``articles``.

    Args:
        articles: An iterable of objects with a ``url`` attribute.
        top_n: How many top-scoring words to print per document
            (defaults to 6).

    Returns:
        None. Results are written to standard output.
    """
    g = Goose()
    bloblist = []
    for a in articles:
        try:
            blob = tb(g.extract(url=a.url).cleaned_text)
        except Exception as e:
            # Best-effort: one bad article must not abort the whole run,
            # but say which one failed and why instead of hiding it.
            print('problem! {!r}: {!r}'.format(getattr(a, 'url', a), e))
            continue
        bloblist.append(blob)

    for i, blob in enumerate(bloblist):
        print("Top words in document {}".format(i + 1))
        # Score each distinct word once; repeated tokens would only
        # recompute the same value.
        scores = {}
        for word in blob.words:
            if word not in scores:
                scores[word] = tfidf(word, blob, bloblist)
        sorted_words = sorted(scores.items(), key=lambda x: x[1], reverse=True)
        for word, score in sorted_words[:top_n]:
            print("\tWord: {}, TF-IDF: {}".format(word, round(score, 5)))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment