# neuman/similarity.py

## similarity.py
from __future__ import division, unicode_literals
import math
from textblob import TextBlob as tb
from goose import Goose

def tf(word, blob):
    """Return the term frequency of ``word`` within ``blob``.

    Term frequency is the number of times ``word`` occurs in the document
    divided by the total number of words in the document.

    Args:
        word: the term to look up.
        blob: a document exposing a ``words`` sequence that supports
            ``count`` and ``len`` (this module builds them with TextBlob).

    Returns:
        A float ratio; ``0.0`` when the document has no words at all.
    """
    words = blob.words
    total = len(words)
    if not total:
        # An empty document has no terms; report 0.0 instead of raising
        # ZeroDivisionError.
        return 0.0
    # float() keeps this a true division regardless of the division mode
    # in effect where the function is defined.
    return words.count(word) / float(total)

def n_containing(word, bloblist):
    """Return how many documents in ``bloblist`` contain ``word``.

    Membership is checked against each document's tokenised ``words``
    rather than the document object itself. With TextBlob, ``word in blob``
    behaves like ``in`` on a plain string (a substring test on the raw
    text), so a term such as "he" would be counted for every document that
    merely contains "the", inflating the document frequency.

    Args:
        word: the term to look for.
        bloblist: iterable of documents, each exposing a ``words`` sequence.

    Returns:
        The number of documents whose ``words`` include ``word``.
    """
    return sum(1 for blob in bloblist if word in blob.words)

def idf(word, bloblist):
    """Return the inverse document frequency of ``word`` over ``bloblist``.

    Computed as the natural log of the corpus size divided by one plus the
    number of documents that contain ``word`` (as counted by
    ``n_containing``).
    """
    corpus_size = len(bloblist)
    smoothed_doc_count = 1 + n_containing(word, bloblist)
    ratio = corpus_size / smoothed_doc_count
    return math.log(ratio)

def tfidf(word, blob, bloblist):
    """Return the TF-IDF score of ``word`` for ``blob`` within ``bloblist``.

    The score is the term frequency of ``word`` in ``blob`` multiplied by
    its inverse document frequency across ``bloblist``.
    """
    term_frequency = tf(word, blob)
    inverse_doc_frequency = idf(word, bloblist)
    return term_frequency * inverse_doc_frequency

def find_keywords_across_articles(articles, top_n=6):
    """Print the highest-scoring TF-IDF words for each article.

    Each article's text is extracted from its URL with Goose and wrapped in
    a TextBlob. Every word of every extracted document is then scored with
    ``tfidf`` against the full list of extracted documents, and the best
    ``top_n`` words per document are printed.

    Articles whose extraction raises are reported and skipped, so the
    document numbers in the output count successfully extracted articles,
    not positions in ``articles``.

    Args:
        articles: iterable of objects exposing a ``url`` attribute.
        top_n: number of top-scoring words to print per document
            (defaults to 6).

    Returns:
        None. Results are written to standard output.
    """
    extractor = Goose()
    bloblist = []
    for article in articles:
        try:
            text = extractor.extract(url=article.url).cleaned_text
            bloblist.append(tb(text))
        except Exception as e:
            # Best effort: one failing article must not abort the run, but
            # report what went wrong. repr() keeps the message printable
            # even when the exception text is not plain ASCII.
            print("problem! {!r}".format(e))

    for i, blob in enumerate(bloblist):
        print("Top words in document {}".format(i + 1))
        scores = {}
        for word in blob.words:
            # Score each distinct word once; a repeated word would only
            # recompute the same value.
            if word not in scores:
                scores[word] = tfidf(word, blob, bloblist)
        sorted_words = sorted(scores.items(), key=lambda x: x[1], reverse=True)
        for word, score in sorted_words[:top_n]:
            print("\tWord: {}, TF-IDF: {}".format(word, round(score, 5)))