An example of how to use yhat's cloud server to "predict" summaries of news articles.
########################################################################
#
# An example of how to deploy a custom predictive model to yhat
# and "predict" the summary for a news article.
#
# Input: URL for a web page containing a news article
#
# Output: Summary of the "story" in the web page for the URL
#
# Example usage: $ python summarizer.py <username> <apikey> <url>
#
########################################################################
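# For example, with one of the sample URLs referenced later in this script
# (the username and API key shown here are placeholders):
#
#   $ python summarizer.py someuser 1234567890abcdef \
#         http://radar.oreilly.com/2013/10/mining-the-social-web-again.html
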
# For getting command line args
import sys
# For pretty-printing API responses
import json
# These libs are provided by yhat on the cloud server and used
# in the summarize function below
import nltk
import numpy
# Had to ask for this to be specially installed on the cloud server
from boilerpipe.extract import Extractor
# Get this via "pip install yhat"
from yhat import Yhat, BaseModel
# This summarize function is taken from Mining the Social Web - http://bit.ly/1dIqdNd
def summarize(url=None, html=None, n=100, cluster_threshold=5, top_sentences=5):
    # Adapted from "The Automatic Creation of Literature Abstracts" by H.P. Luhn
    #
    # Parameters:
    # * n - Number of words to consider
    # * cluster_threshold - Distance between words to consider
    # * top_sentences - Number of sentences to return for a "top n" summary
    # Begin - nested helper function
    def score_sentences(sentences, important_words):
        scores = []
        sentence_idx = -1

        for s in [nltk.tokenize.word_tokenize(s) for s in sentences]:
            sentence_idx += 1
            word_idx = []

            # For each word in the word list...
            for w in important_words:
                try:
                    # Compute an index for important words in each sentence
                    word_idx.append(s.index(w))
                except ValueError:  # w not in this particular sentence
                    pass

            word_idx.sort()

            # It is possible that some sentences may not contain any important words
            if len(word_idx) == 0:
                continue

            # Using the word index, compute clusters with a max distance threshold
            # for any two consecutive words
            clusters = []
            cluster = [word_idx[0]]
            i = 1
            while i < len(word_idx):
                if word_idx[i] - word_idx[i - 1] < cluster_threshold:
                    cluster.append(word_idx[i])
                else:
                    clusters.append(cluster[:])
                    cluster = [word_idx[i]]
                i += 1
            clusters.append(cluster)

            # Score each cluster. The max score for any given cluster is the score
            # for the sentence.
            max_cluster_score = 0
            for c in clusters:
                significant_words_in_cluster = len(c)
                total_words_in_cluster = c[-1] - c[0] + 1
                score = 1.0 * significant_words_in_cluster \
                    * significant_words_in_cluster / total_words_in_cluster
                if score > max_cluster_score:
                    max_cluster_score = score

            scores.append((sentence_idx, max_cluster_score))

        return scores
    # End - nested helper function
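
    # A quick worked example of the scoring above (the numbers are hypothetical,
    # not taken from any real article): if a tokenized sentence has important
    # words at indices [2, 5, 7, 14] and cluster_threshold is 5, then 2, 5, and 7
    # form one cluster (gaps of 3 and 2) and 14 starts a new one (gap of 7).
    # The first cluster scores 3 * 3 / (7 - 2 + 1) = 1.5 and the second scores
    # 1 * 1 / 1 = 1.0, so the sentence's score is 1.5.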
    extractor = Extractor(extractor='ArticleExtractor', url=url, html=html)

    # It's entirely possible that this "clean page" will be a big mess. YMMV.
    # The good news is that the summarize algorithm inherently accounts for handling
    # a lot of this noise.
    txt = extractor.getText()

    sentences = [s for s in nltk.tokenize.sent_tokenize(txt)]
    normalized_sentences = [s.lower() for s in sentences]

    words = [w.lower() for sentence in normalized_sentences for w in
             nltk.tokenize.word_tokenize(sentence)]

    fdist = nltk.FreqDist(words)

    top_n_words = [w[0] for w in fdist.items()
                   if w[0] not in nltk.corpus.stopwords.words('english')][:n]

    scored_sentences = score_sentences(normalized_sentences, top_n_words)

    # Summarization Approach 1:
    # Filter out nonsignificant sentences by using the average score plus a
    # fraction of the std dev as a filter
    avg = numpy.mean([s[1] for s in scored_sentences])
    std = numpy.std([s[1] for s in scored_sentences])
    mean_scored = [(sent_idx, score) for (sent_idx, score) in scored_sentences
                   if score > avg + 0.5 * std]

    # Summarization Approach 2:
    # Another approach would be to return only the top N ranked sentences
    top_n_scored = sorted(scored_sentences, key=lambda s: s[1])[-top_sentences:]
    top_n_scored = sorted(top_n_scored, key=lambda s: s[0])

    # Decorate the post object with summaries
    return dict(top_n_summary=[sentences[idx] for (idx, score) in top_n_scored],
                mean_scored_summary=[sentences[idx] for (idx, score) in mean_scored])
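
# A minimal local sanity check of summarize() (commented out; it assumes the
# nltk 'punkt' and 'stopwords' data and boilerpipe are installed locally, and
# it reuses one of the sample URLs from the __main__ block below):
#
#print json.dumps(summarize(url='http://radar.oreilly.com/2013/10/mining-the-social-web-again.html'), indent=2)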
# Create a model by overriding yhat's BaseModel
class MySummarizerModel(BaseModel):
    def require(self):
        from boilerpipe.extract import Extractor
        import nltk
        import numpy

    def transform(self, d):
        return (d, summarize(url=d),)

    def predict(self, d):
        return {'url': d[0], 'summary': d[1]}
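
# A hypothetical local smoke test of the model class (commented out): predict()
# expects the (url, summary) tuple produced by transform(), so chaining the two
# calls directly approximates what the yhat server presumably does, without an
# upload or API key.
#
#my_model = MySummarizerModel(udfs=[summarize])
#print json.dumps(my_model.predict(my_model.transform('http://radar.oreilly.com/2013/06/phishing-in-facebooks-pond.html')), indent=2)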
if __name__ == '__main__':
    # Get the username, api key, and url from the command line
    USERNAME, APIKEY = sys.argv[1], sys.argv[2]

    # Some sample URLs to try:
    # http://radar.oreilly.com/2013/10/mining-the-social-web-again.html or
    # http://radar.oreilly.com/2013/06/phishing-in-facebooks-pond.html
    URL = sys.argv[3]

    # Create a model to use
    my_summarizer_model = MySummarizerModel(udfs=[summarize])

    # Create an API connection
    yh = Yhat(USERNAME, APIKEY)

    # Upload the model to the yhat server and print the results.
    # Commented out, because it's already uploaded.
    #print yh.upload("MySummarizerModel", my_summarizer_model)

    # How to list available models and versions. (It would be convenient
    # not to have to provide a specific version, so that omitting it would
    # call the latest available model.)
    #print yh.show_models()

    # Make a prediction with the uploaded model and display the result.
    prediction = yh.raw_predict("MySummarizerModel", 1, URL)
    print json.dumps(prediction, indent=2)

    # You can also use yhat's API to predict with a RESTful endpoint.
    # (BTW, it's unfortunate that you can't GET /predict with REST. This
    # particular model is a great example of why that would be super-useful.)