Created
August 30, 2013 23:21
-
-
Save jackschultz/6395210 to your computer and use it in GitHub Desktop.
Article summarizer written in python.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import nltk | |
from nltk.stem.wordnet import WordNetLemmatizer | |
import string | |
class SentenceRank(object):
    """Rank the sentences of an article by mutual word overlap.

    The title is prepended to the body as sentence 0; intersections
    involving it are weighted by ``title_importance`` so sentences that
    share words with the title score higher.
    """

    # Lazily-populated caches shared by all instances (loading the
    # stopword list / constructing a lemmatizer per call is wasteful).
    _stopword_set = None
    _lemmatizer = None

    def __init__(self, body, title):
        self.body = body
        # Keep the raw body sentences for later display; the cleaned
        # token lists live separately in self.sentences.
        self.sentence_list = nltk.tokenize.sent_tokenize(self.body)[:]
        self.title = title
        # Prepend the title so it becomes sentence 0 of the cleaned set.
        self.body = self.title + '. ' + self.body
        self.sentence_clean()
        # Weight applied to intersections involving the title sentence.
        self.title_importance = 2

    def _stopwords(self):
        """Return the English stopword list as a cached set (O(1) lookups)."""
        if SentenceRank._stopword_set is None:
            SentenceRank._stopword_set = set(
                nltk.corpus.stopwords.words('english'))
        return SentenceRank._stopword_set

    def _normalize_tokens(self, sentence):
        """Tokenize *sentence*, lemmatize nouns/verbs, drop stopwords."""
        tagged = nltk.pos_tag(self.word_split(sentence))
        normalized = []
        for word, tag in tagged:
            if tag[0] == 'N':
                normalized.append(self.word_stem(word))
            elif tag[0] == 'V':
                normalized.append(self.word_stem(word, pos='v'))
            else:
                normalized.append(word)
        return self.remove_stopwords(normalized)

    def title_clean(self):
        """Append the cleaned title tokens to self.sentences."""
        # BUG FIX: the original assumed self.sentences already existed
        # (it is only created by sentence_clean); create it if missing so
        # this method is safe to call on its own.
        if not hasattr(self, 'sentences'):
            self.sentences = []
        self.sentences.append(self._normalize_tokens(self.title))

    def sentence_clean(self):
        '''
        This cleans the sentences, and returns just the word tokens, without
        stopwords, and all singular
        '''
        sents = nltk.tokenize.sent_tokenize(self.body)
        self.sentences = [self._normalize_tokens(sen) for sen in sents]

    def sentence_intersection(self, s1, s2):
        '''
        Calculates the intersection of the sentences and standardizes on length:
        |s1 & s2| divided by the mean set size (0 when both are empty).
        '''
        set1 = set(s1)
        set2 = set(s2)
        if (len(set1) + len(set2)) == 0:
            return 0
        return len(set.intersection(set1, set2)) / ((len(set1) + len(set2)) / 2.0)

    def word_split(self, sentence):
        '''
        splits the sentence into a list of words, punctuation removed
        '''
        # BUG FIX: str.translate(None, chars) is Python-2-only (TypeError
        # on Python 3); filtering characters works identically on both.
        stripped = ''.join(
            ch for ch in sentence if ch not in string.punctuation)
        return nltk.word_tokenize(stripped)

    def word_pos_tag(self, sentence):
        '''
        Part-of-speech tags an already-tokenized list of words
        '''
        return nltk.pos_tag(sentence)

    def remove_stopwords(self, words):
        """Filter common English stopwords out of *words*."""
        stopwords = self._stopwords()
        return [w for w in words if w.lower() not in stopwords]

    def sentence_content(self, sentence):
        '''
        Determines the content fraction ie non common words
        '''
        words = self.word_split(sentence)
        if len(words) == 0:
            return 0
        stopwords = self._stopwords()
        content = [w for w in words if w.lower() not in stopwords]
        return len(content) / float(len(words))

    def word_stem(self, word, pos='n'):
        """Lemmatize *word* for the given part of speech ('n' or 'v')."""
        # Reuse one lemmatizer rather than constructing one per word.
        if SentenceRank._lemmatizer is None:
            SentenceRank._lemmatizer = WordNetLemmatizer()
        return SentenceRank._lemmatizer.lemmatize(word, pos)

    def rank_sentences(self):
        """Score each non-title sentence by its total overlap with every
        other sentence (title overlap weighted by ``title_importance``).

        Returns a dict mapping sentence index (1-based; 0 is the title
        and is never a key) to its score.
        """
        n = len(self.sentences)
        # xrange was Python-2-only; range works on both 2 and 3.
        values = [[0] * n for _ in range(n)]
        for i in range(n):
            for j in range(n):
                values[i][j] = self.sentence_intersection(
                    self.sentences[i], self.sentences[j])
                if i == 0 or j == 0:
                    # Overlap with the title counts extra.
                    values[i][j] *= self.title_importance
        sent_dict = {}
        for i in range(1, n):  # skip the title itself
            score = 0
            # BUG FIX: the original summed j over range(1, n), so the
            # title column (j == 0) — the only place the weighted values
            # could influence a score — was never read, making
            # title_importance dead code.  Include it.
            for j in range(n):
                if i == j:
                    continue
                score += values[i][j]
            sent_dict[i] = score
        return sent_dict
if __name__ == '__main__':
    from goose import Goose
    import operator

    # Article to fetch and the number of summary sentences to keep.
    url = ''
    number = 8
    # Example: url = 'http://www.jsonline.com/sports/golf/sherri-steinhauer-riding-high-on-legends-tour-b9984666z1-221411071.html'

    g = Goose()
    info = g.extract(url=url)
    # Fold the extracted text to plain ASCII.  BUG FIX: on Python 3
    # .encode() returns bytes, so .replace('?', ' ') would fail; decoding
    # back keeps this working on both Python 2 and 3.
    content = info.cleaned_text.encode('ascii', 'replace').decode('ascii').replace('?', ' ')
    title = info.title.encode('ascii', 'ignore').decode('ascii')

    sr = SentenceRank(content, title)
    ranks = sr.rank_sentences()
    # BUG FIX: dict.iteritems() is Python-2-only; items() works on both.
    sorted_ranks = sorted(ranks.items(), key=operator.itemgetter(1), reverse=True)
    ranks_key = [key for (key, val) in sorted_ranks]
    ranks_value = [val for (key, val) in sorted_ranks]

    # Print the top `number` sentences in original article order.
    # Rank index k is 1-based because index 0 of the cleaned sentences is
    # the title, so k-1 indexes sr.sentence_list (body sentences only).
    print('doing ranks')
    print(ranks_key)
    print(sum(ranks_value))
    for k in sorted(ranks_key[0:number]):
        print('Sent ' + str(k) + ': ' + sr.sentence_list[k-1].strip('\n'))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment