Created
August 30, 2013 23:21
-
-
Save jackschultz/6395210 to your computer and use it in GitHub Desktop.
Article summarizer written in python.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import nltk | |
from nltk.stem.wordnet import WordNetLemmatizer | |
import string | |
class SentenceRank(object):
    """Rank the sentences of an article by mutual word overlap.

    The title is prepended to the body as sentence 0; intersections
    involving it are weighted by ``title_importance`` so sentences that
    share words with the title score higher.
    """

    # Lazily-populated caches shared by all instances (loading the
    # stopword list / constructing a lemmatizer per call is wasteful).
    _stopword_set = None
    _lemmatizer = None

    def __init__(self, body, title):
        self.body = body
        # Keep the raw body sentences for later display; the cleaned
        # token lists live separately in self.sentences.
        self.sentence_list = nltk.tokenize.sent_tokenize(self.body)[:]
        self.title = title
        # Prepend the title so it becomes sentence 0 of the cleaned set.
        self.body = self.title + '. ' + self.body
        self.sentence_clean()
        # Weight applied to intersections involving the title sentence.
        self.title_importance = 2

    def _stopwords(self):
        """Return the English stopword list as a cached set (O(1) lookups)."""
        if SentenceRank._stopword_set is None:
            SentenceRank._stopword_set = set(
                nltk.corpus.stopwords.words('english'))
        return SentenceRank._stopword_set

    def _normalize_tokens(self, sentence):
        """Tokenize *sentence*, lemmatize nouns/verbs, drop stopwords."""
        tagged = nltk.pos_tag(self.word_split(sentence))
        normalized = []
        for word, tag in tagged:
            if tag[0] == 'N':
                normalized.append(self.word_stem(word))
            elif tag[0] == 'V':
                normalized.append(self.word_stem(word, pos='v'))
            else:
                normalized.append(word)
        return self.remove_stopwords(normalized)

    def title_clean(self):
        """Append the cleaned title tokens to self.sentences."""
        # BUG FIX: the original assumed self.sentences already existed
        # (it is only created by sentence_clean); create it if missing so
        # this method is safe to call on its own.
        if not hasattr(self, 'sentences'):
            self.sentences = []
        self.sentences.append(self._normalize_tokens(self.title))

    def sentence_clean(self):
        '''
        This cleans the sentences, and returns just the word tokens, without
        stopwords, and all singular
        '''
        sents = nltk.tokenize.sent_tokenize(self.body)
        self.sentences = [self._normalize_tokens(sen) for sen in sents]

    def sentence_intersection(self, s1, s2):
        '''
        Calculates the intersection of the sentences and standardizes on length:
        |s1 & s2| divided by the mean set size (0 when both are empty).
        '''
        set1 = set(s1)
        set2 = set(s2)
        if (len(set1) + len(set2)) == 0:
            return 0
        return len(set.intersection(set1, set2)) / ((len(set1) + len(set2)) / 2.0)

    def word_split(self, sentence):
        '''
        splits the sentence into a list of words, punctuation removed
        '''
        # BUG FIX: str.translate(None, chars) is Python-2-only (TypeError
        # on Python 3); filtering characters works identically on both.
        stripped = ''.join(
            ch for ch in sentence if ch not in string.punctuation)
        return nltk.word_tokenize(stripped)

    def word_pos_tag(self, sentence):
        '''
        Part-of-speech tags an already-tokenized list of words
        '''
        return nltk.pos_tag(sentence)

    def remove_stopwords(self, words):
        """Filter common English stopwords out of *words*."""
        stopwords = self._stopwords()
        return [w for w in words if w.lower() not in stopwords]

    def sentence_content(self, sentence):
        '''
        Determines the content fraction ie non common words
        '''
        words = self.word_split(sentence)
        if len(words) == 0:
            return 0
        stopwords = self._stopwords()
        content = [w for w in words if w.lower() not in stopwords]
        return len(content) / float(len(words))

    def word_stem(self, word, pos='n'):
        """Lemmatize *word* for the given part of speech ('n' or 'v')."""
        # Reuse one lemmatizer rather than constructing one per word.
        if SentenceRank._lemmatizer is None:
            SentenceRank._lemmatizer = WordNetLemmatizer()
        return SentenceRank._lemmatizer.lemmatize(word, pos)

    def rank_sentences(self):
        """Score each non-title sentence by its total overlap with every
        other sentence (title overlap weighted by ``title_importance``).

        Returns a dict mapping sentence index (1-based; 0 is the title
        and is never a key) to its score.
        """
        n = len(self.sentences)
        # xrange was Python-2-only; range works on both 2 and 3.
        values = [[0] * n for _ in range(n)]
        for i in range(n):
            for j in range(n):
                values[i][j] = self.sentence_intersection(
                    self.sentences[i], self.sentences[j])
                if i == 0 or j == 0:
                    # Overlap with the title counts extra.
                    values[i][j] *= self.title_importance
        sent_dict = {}
        for i in range(1, n):  # skip the title itself
            score = 0
            # BUG FIX: the original summed j over range(1, n), so the
            # title column (j == 0) — the only place the weighted values
            # could influence a score — was never read, making
            # title_importance dead code.  Include it.
            for j in range(n):
                if i == j:
                    continue
                score += values[i][j]
            sent_dict[i] = score
        return sent_dict
if __name__ == '__main__':
    from goose import Goose
    import operator

    # Article to fetch and the number of summary sentences to keep.
    url = ''
    number = 8
    # Example: url = 'http://www.jsonline.com/sports/golf/sherri-steinhauer-riding-high-on-legends-tour-b9984666z1-221411071.html'

    g = Goose()
    info = g.extract(url=url)
    # Fold the extracted text to plain ASCII.  BUG FIX: on Python 3
    # .encode() returns bytes, so .replace('?', ' ') would fail; decoding
    # back keeps this working on both Python 2 and 3.
    content = info.cleaned_text.encode('ascii', 'replace').decode('ascii').replace('?', ' ')
    title = info.title.encode('ascii', 'ignore').decode('ascii')

    sr = SentenceRank(content, title)
    ranks = sr.rank_sentences()
    # BUG FIX: dict.iteritems() is Python-2-only; items() works on both.
    sorted_ranks = sorted(ranks.items(), key=operator.itemgetter(1), reverse=True)
    ranks_key = [key for (key, val) in sorted_ranks]
    ranks_value = [val for (key, val) in sorted_ranks]

    # Print the top `number` sentences in original article order.
    # Rank index k is 1-based because index 0 of the cleaned sentences is
    # the title, so k-1 indexes sr.sentence_list (body sentences only).
    print('doing ranks')
    print(ranks_key)
    print(sum(ranks_value))
    for k in sorted(ranks_key[0:number]):
        print('Sent ' + str(k) + ': ' + sr.sentence_list[k-1].strip('\n'))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment