public
Created

Summarization algorithm adapted for Flask and PythonAnywhere (based on http://thetokenizer.com/2013/04/28/build-your-own-summary-tool/)

  • Download Gist
summarize.py
Python
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124
# coding=UTF-8
from __future__ import division
import re, urllib2
 
from flask import Flask, request, jsonify
app = Flask(__name__)
 
@app.route('/')
def hello_world():
    """Serve a fixed greeting at the site root."""
    greeting = 'Hello from Flask!'
    return greeting
 
@app.route('/summarize')
def summarize():
    """Summarize the text passed in the ``content`` query parameter.

    Query parameters:
        title:   optional; placed on the first line of the summary.
                 A missing title is treated as an empty string.
        content: required; the text to summarize.

    Returns a JSON object ``{"result": <summary>}``.  When ``content`` is
    absent, a JSON object ``{"error": ...}`` is returned with HTTP status
    400 instead of failing inside ``unquote`` with a server error.
    """
    raw_title = request.args.get('title')
    raw_content = request.args.get('content')

    # request.args.get() yields None for an absent parameter, and
    # urllib2.unquote(None) raises, so reject the request explicitly.
    if raw_content is None:
        return jsonify(error='missing required query parameter: content'), 400

    # NOTE(review): the query string is normally URL-decoded by Flask
    # already; this extra unquote is kept on the assumption that clients
    # double-encode their text -- confirm before removing.
    title = urllib2.unquote(raw_title) if raw_title is not None else ''
    content = urllib2.unquote(raw_content)

    st = SummaryTool()
    sentences_dic = st.get_sentences_ranks(content)
    summary = st.get_summary(title, content, sentences_dic)
    return jsonify(result=summary)
 
 
# This is a naive text summarization algorithm
# Created by Shlomi Babluki
# April, 2013
 
class SummaryTool(object):
    """Naive extractive text summarizer.

    Each sentence is scored by how many words it shares with every other
    sentence of the text; the summary is the title followed by the
    highest-scoring sentence of each paragraph.
    """

    def split_content_to_sentences(self, content):
        """Naively split *content* into sentences.

        Newlines are treated as sentence ends, and the text is then cut at
        every ". " (period followed by a space).
        """
        content = content.replace("\n", ". ")
        return content.split(". ")

    def split_content_to_paragraphs(self, content):
        """Naively split *content* into paragraphs at blank lines."""
        return content.split("\n\n")

    def sentences_intersection(self, sent1, sent2):
        """Calculate the normalized word overlap between two sentences.

        Returns the number of distinct words the sentences share divided
        by the average number of distinct words in the two sentences, or
        0 when neither sentence contains a word.
        """
        # Split on single spaces and drop the empty tokens produced by empty
        # sentences or by leading/trailing/doubled spaces; otherwise "" would
        # count as a shared word and the guard below could never trigger.
        words1 = set(word for word in sent1.split(" ") if word)
        words2 = set(word for word in sent2.split(" ") if word)

        # Nothing to compare (also avoids dividing by zero)
        if not words1 and not words2:
            return 0

        # Normalize by the average number of words.  The float literal keeps
        # this a true division even without "from __future__ import division".
        average_size = (len(words1) + len(words2)) / 2.0
        return len(words1 & words2) / average_size

    def format_sentence(self, sentence):
        """Return *sentence* with every non-word character removed.

        Letters, digits and underscores are kept; whitespace and
        punctuation are dropped.  The result is used as the key of a
        sentence in the sentences dictionary.
        """
        return re.sub(r'\W+', '', sentence)

    def get_sentences_ranks(self, content):
        """Convert *content* into a dictionary <K, V>.

        K = the formatted sentence (see format_sentence)
        V = the rank of the sentence: the sum of its intersections with
            every other sentence of the content

        Sentences that format to the same key share one entry; the last
        such sentence in the text determines the stored rank.
        """
        sentences = self.split_content_to_sentences(content)

        sentences_dic = {}
        for i, sentence in enumerate(sentences):
            score = 0
            for j, other in enumerate(sentences):
                # A sentence is not compared with itself
                if i != j:
                    score += self.sentences_intersection(sentence, other)
            sentences_dic[self.format_sentence(sentence)] = score
        return sentences_dic

    def get_best_sentence(self, paragraph, sentences_dic):
        """Return the best-ranked sentence of *paragraph*.

        Returns "" for paragraphs of fewer than two sentences and when no
        sentence has a rank above 0.  On equal ranks the earliest sentence
        wins.
        """
        sentences = self.split_content_to_sentences(paragraph)

        # Ignore short paragraphs
        if len(sentences) < 2:
            return ""

        best_sentence = ""
        max_value = 0
        for candidate in sentences:
            key = self.format_sentence(candidate)
            if not key:
                continue
            # A sentence missing from the dictionary ranks as 0 rather than
            # raising KeyError.
            rank = sentences_dic.get(key, 0)
            if rank > max_value:
                max_value = rank
                best_sentence = candidate

        return best_sentence

    def get_summary(self, title, content, sentences_dic):
        """Build the summary of *content*.

        The result is the stripped title, an empty line, and then the best
        sentence of each paragraph (paragraphs without one are skipped),
        joined with newlines.
        """
        summary = [title.strip(), ""]

        # Add the best sentence from each paragraph
        for paragraph in self.split_content_to_paragraphs(content):
            sentence = self.get_best_sentence(paragraph, sentences_dic).strip()
            if sentence:
                summary.append(sentence)

        return "\n".join(summary)

Please sign in to comment on this gist.

Something went wrong with that request. Please try again.