Article summarizer written in Python 2.
import nltk
from nltk.stem.wordnet import WordNetLemmatizer
import string


class SentenceRank(object):

    def __init__(self, body, title):
        self.body = body
        # Keep the original sentences for display; tokenize before the
        # title is prepended so indices line up with the ranked scores.
        self.sentence_list = nltk.tokenize.sent_tokenize(self.body)
        self.title = title
        # Prepend the title as sentence 0 so it takes part in the
        # intersection scoring.
        self.body = self.title + '. ' + self.body
        self.sentence_clean()
        self.title_importance = 2
    def title_clean(self):
        '''
        Cleans the title the same way sentence_clean() cleans the body.
        Currently unused: the title is handled by prepending it to the
        body in __init__.
        '''
        split = self.word_split(self.title)
        tokens = nltk.pos_tag(split)
        sing_toks = []
        for key, value in tokens:
            if value[0] == 'N':
                sing_toks.append(self.word_stem(key))
            elif value[0] == 'V':
                sing_toks.append(self.word_stem(key, pos='v'))
            else:
                sing_toks.append(key)
        self.sentences.append(self.remove_stopwords(sing_toks))
    def sentence_clean(self):
        '''
        Cleans the sentences and keeps just the word tokens, without
        stopwords, with nouns and verbs lemmatized to their base forms.
        '''
        sents = nltk.tokenize.sent_tokenize(self.body)
        self.sentences = []
        for sen in sents:
            split = self.word_split(sen)
            tokens = nltk.pos_tag(split)
            sing_toks = []
            for key, value in tokens:
                if value[0] == 'N':
                    sing_toks.append(self.word_stem(key))
                elif value[0] == 'V':
                    sing_toks.append(self.word_stem(key, pos='v'))
                else:
                    sing_toks.append(key)
            important = self.remove_stopwords(sing_toks)
            self.sentences.append(important)
    def sentence_intersection(self, s1, s2):
        '''
        Calculates the overlap between two cleaned token lists,
        normalized by their average set size so long sentences are not
        favored.
        '''
        set1 = set(s1)
        set2 = set(s2)
        if (len(set1) + len(set2)) == 0:
            return 0
        return len(set.intersection(set1, set2)) / ((len(set1) + len(set2)) / 2.0)
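    # A quick worked example of the metric above (illustrative, not part
    # of the original gist):
    #
    #   >>> sr.sentence_intersection(['cat', 'dog', 'fish'],
    #   ...                          ['dog', 'fish', 'bird'])
    #   0.6666666666666666
    #
    # Two shared tokens, average set size (3 + 3) / 2.0 = 3.0, so the
    # score is 2 / 3.0. Identical sentences score 1.0.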
    def word_split(self, sentence):
        '''
        Strips punctuation and splits the sentence into a list of words.
        str.translate(None, chars) is the Python 2 byte-string form.
        '''
        return nltk.word_tokenize(sentence.translate(None, string.punctuation))
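    # e.g. (illustrative) word_split("Hello, world!") -> ['Hello', 'world']
    # once the punctuation has been deleted.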
    def word_pos_tag(self, sentence):
        '''
        Part-of-speech tags a list of words.
        '''
        return nltk.pos_tag(sentence)
    def remove_stopwords(self, words):
        '''
        Filters out common English stopwords.
        '''
        stopwords = nltk.corpus.stopwords.words('english')
        return [w for w in words if w.lower() not in stopwords]
    def sentence_content(self, sentence):
        '''
        Determines the content fraction of a sentence, i.e. the share of
        its words that are not common stopwords. Unused by
        rank_sentences().
        '''
        stopwords = nltk.corpus.stopwords.words('english')
        words = self.word_split(sentence)
        if len(words) == 0:
            return 0
        content = [w for w in words if w.lower() not in stopwords]
        return len(content) / float(len(words))
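    # Illustrative example (not in the original gist): for the sentence
    # 'The cat sat on the mat', word_split() yields six words, of which
    # 'The', 'on', and 'the' are stopwords, so the content fraction is
    # 3 / 6.0 = 0.5.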
    def word_stem(self, word, pos='n'):
        '''
        Lemmatizes a word with WordNet; pos is 'n' for nouns (the
        default) or 'v' for verbs.
        '''
        wnl = WordNetLemmatizer()
        return wnl.lemmatize(word, pos)
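    # For example (illustrative): wnl.lemmatize('cars') returns 'car',
    # and wnl.lemmatize('running', 'v') returns 'run', which is why
    # sentence_clean() passes pos='v' for verb tags.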
    def rank_sentences(self):
        '''
        Builds an n x n matrix of pairwise sentence intersections, then
        scores each sentence by summing its row. Row/column 0 is the
        title, weighted by title_importance.
        '''
        n = len(self.sentences)
        values = [[0 for x in xrange(n)] for x in xrange(n)]
        for i in range(0, n):
            for j in range(0, n):
                values[i][j] = self.sentence_intersection(self.sentences[i], self.sentences[j])
                if i == 0 or j == 0:
                    values[i][j] *= self.title_importance
        sent_dict = {}
        for i in range(1, n):
            score = 0
            # Start at j = 0 so the weighted title column actually
            # counts toward each sentence's score.
            for j in range(0, n):
                if i == j:
                    continue
                score += values[i][j]
            sent_dict[i] = score
        return sent_dict
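# Minimal usage sketch (illustrative, with made-up text; assumes the
# NLTK punkt, stopwords, wordnet, and tagger data are downloaded):
#
#   body = ('The quick brown fox jumps over the lazy dog. '
#           'Foxes are quick animals. Dogs are usually lazy.')
#   sr = SentenceRank(body, 'Quick foxes and lazy dogs')
#   scores = sr.rank_sentences()   # e.g. {1: ..., 2: ..., 3: ...}
#   best = max(scores, key=scores.get)
#   print sr.sentence_list[best - 1]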
if __name__ == '__main__':
    from goose import Goose

    url = ''
    number = 8  # how many sentences to keep in the summary
    #url = 'http://www.jsonline.com/sports/golf/sherri-steinhauer-riding-high-on-legends-tour-b9984666z1-221411071.html'
    g = Goose()
    info = g.extract(url=url)
    # Force plain ascii so the Python 2 string methods above behave.
    content = info.cleaned_text.encode('ascii', 'replace').replace('?', ' ')
    title = info.title.encode('ascii', 'ignore')
    sr = SentenceRank(content, title)
    ranks = sr.rank_sentences()

    import operator
    sorted_ranks = sorted(ranks.iteritems(), key=operator.itemgetter(1), reverse=True)
    ranks_key = [key for (key, val) in sorted_ranks]
    ranks_value = [val for (key, val) in sorted_ranks]
    # Sort the top `number` sentences back into document order and print them.
    print
    print 'doing ranks'
    print
    print ranks_key
    print
    print sum(ranks_value)
    print
    for k in sorted(ranks_key[0:number]):
        print 'Sent ' + str(k) + ': ' + sr.sentence_list[k-1].strip('\n')