Created
April 30, 2013 19:37
-
-
Save ismaelc/5491316 to your computer and use it in GitHub Desktop.
Summarize algorithm adapted for Flask and PythonAnywhere (based on http://thetokenizer.com/2013/04/28/build-your-own-summary-tool/)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# coding=UTF-8 | |
from __future__ import division | |
import re, urllib2 | |
from flask import Flask, request, jsonify | |
app = Flask(__name__) | |
@app.route('/')
def hello_world():
    """Serve the site root with a fixed plain-text greeting."""
    greeting = 'Hello from Flask!'
    return greeting
@app.route('/summarize')
def summarize():
    """Summarize the text passed in the query string and return it as JSON.

    Query parameters:
        title:   headline placed on the first line of the summary.
        content: text to summarize; paragraphs are separated by blank lines.

    Returns:
        A JSON response of the form {"result": <summary text>}.
    """
    # A missing parameter yields None from args.get(), which unquote() cannot
    # handle; fall back to an empty string so the request does not end in a
    # server error.
    title = urllib2.unquote(request.args.get('title', ''))
    content = urllib2.unquote(request.args.get('content', ''))

    st = SummaryTool()
    sentences_dic = st.get_sentences_ranks(content)
    summary = st.get_summary(title, content, sentences_dic)
    return jsonify(result=summary)
# This is a naive text summarization algorithm | |
# Created by Shlomi Babluki | |
# April, 2013 | |
class SummaryTool(object):
    """Naive extractive summarizer.

    Every sentence is scored by how many words it shares with all the other
    sentences of the text; the summary is the title followed by the
    highest-scoring sentence of each paragraph.
    """

    def split_content_to_sentences(self, content):
        """Split *content* into sentences (naive).

        Newlines are treated as sentence breaks, then the text is split on
        the literal ". " separator. Empty strings may appear in the result.
        """
        return content.replace("\n", ". ").split(". ")

    def split_content_to_paragraphs(self, content):
        """Split *content* into paragraphs on blank lines (naive)."""
        return content.split("\n\n")

    def sentences_intersection(self, sent1, sent2):
        """Return the normalized word overlap between two sentences.

        The sentences are split on single spaces into sets of tokens; the
        result is the number of shared tokens divided by the average size of
        the two sets.
        """
        s1 = set(sent1.split(" "))
        s2 = set(sent2.split(" "))

        # Guard against dividing by zero when both token sets are empty.
        total = len(s1) + len(s2)
        if total == 0:
            return 0

        # Dividing by 2.0 keeps this a true division even when the module is
        # run without "from __future__ import division".
        return len(s1 & s2) / (total / 2.0)

    def format_sentence(self, sentence):
        """Return *sentence* with every non-word character removed.

        The result is used as the key of the sentence in the ranks
        dictionary.
        """
        return re.sub(r'\W+', '', sentence)

    def get_sentences_ranks(self, content):
        """Build the ranks dictionary for *content*.

        Returns:
            dict mapping format_sentence(sentence) to the sum of that
            sentence's intersections with every other sentence. Sentences
            that format to the same key overwrite one another; the last one
            wins.
        """
        sentences = self.split_content_to_sentences(content)

        sentences_dic = {}
        for i, sentence in enumerate(sentences):
            # Accumulate directly instead of filling an n x n matrix first;
            # a sentence is never compared with itself.
            score = 0
            for j, other in enumerate(sentences):
                if i != j:
                    score += self.sentences_intersection(sentence, other)
            sentences_dic[self.format_sentence(sentence)] = score
        return sentences_dic

    def get_best_sentence(self, paragraph, sentences_dic):
        """Return the highest-ranked sentence of *paragraph*.

        Returns "" for paragraphs with fewer than two sentences, or when no
        sentence has a rank above zero. On ties the earliest sentence wins.
        """
        sentences = self.split_content_to_sentences(paragraph)

        # Ignore short paragraphs
        if len(sentences) < 2:
            return ""

        best_sentence = ""
        max_value = 0
        for s in sentences:
            strip_s = self.format_sentence(s)
            if not strip_s:
                continue
            # A sentence missing from the dictionary counts as rank 0
            # rather than raising KeyError.
            rank = sentences_dic.get(strip_s, 0)
            if rank > max_value:
                max_value = rank
                best_sentence = s
        return best_sentence

    def get_summary(self, title, content, sentences_dic):
        """Assemble the summary text.

        The result is the stripped title, an empty line, and then the best
        sentence of each paragraph of *content* (paragraphs that yield no
        sentence are skipped), joined with newlines.
        """
        summary = [title.strip(), ""]

        for p in self.split_content_to_paragraphs(content):
            sentence = self.get_best_sentence(p, sentences_dic).strip()
            if sentence:
                summary.append(sentence)

        return "\n".join(summary)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment