Skip to content

Instantly share code, notes, and snippets.

Created December 1, 2013 01:25
Show Gist options
  • Save ptwobrussell/7727485 to your computer and use it in GitHub Desktop.
Save ptwobrussell/7727485 to your computer and use it in GitHub Desktop.
An example of how to use yhat's cloud server to "predict" summaries of news articles.
# An example of how to deploy a custom predictive model to yhat
# and "predict" the summary for a news article.
# Input: URL for a web page containing a news article
# Output: Summary of the "story" in the web page for the URL
# Example usage: $ python <this_script.py> <username> <apikey> <url>
# For getting command line args
import sys
# For pretty-printing API responses
import json
# These libs are provided by yhat on the cloud server and used
# in the summarize function below
import nltk
import numpy
# Had to ask for this to be specially installed on the cloud server
from boilerpipe.extract import Extractor
# Get this via "pip install yhat"
from yhat import Yhat, BaseModel
# This summarize function is taken from the book "Mining the Social Web".
def summarize(url=None, html=None, n=100, cluster_threshold=5, top_sentences=5):
    """Summarize the article found at ``url`` (or contained in ``html``).

    Adapted from "The Automatic Creation of Literature Abstracts" by H.P. Luhn.

    The page text is extracted with boilerpipe's ``ArticleExtractor``, split
    into sentences, and each sentence is scored by how densely it packs the
    most frequent non-stopword terms of the document.

    Parameters:
      * url - URL of the page to summarize (passed through to ``Extractor``)
      * html - Raw HTML to summarize (passed through to ``Extractor``)
      * n - Number of words to consider
      * cluster_threshold - Distance between words to consider
      * top_sentences - Number of sentences to return for a "top n" summary

    Returns a dict with two lists of sentences, both in document order:
      * 'top_n_summary' - the ``top_sentences`` highest scoring sentences
      * 'mean_scored_summary' - the sentences scoring above the mean score
        plus half a standard deviation
    """

    # Begin - nested helper function
    # NOTE: kept nested (rather than module level) so that the function stays
    # self-contained when it is handed to the model via udfs=[summarize].
    def score_sentences(sentences, important_words):
        """Return a list of (sentence_index, score) tuples.

        Sentences that contain none of ``important_words`` are left out.
        """
        scores = []

        for sentence_idx, sentence in enumerate(sentences):
            tokens = nltk.tokenize.word_tokenize(sentence)

            # Compute an index for important words in each sentence
            word_idx = []
            for w in important_words:
                try:
                    word_idx.append(tokens.index(w))
                except ValueError:  # w not in this particular sentence
                    pass

            # It is possible that some sentences may not contain any
            # important words
            if not word_idx:
                continue

            # The clustering below compares consecutive positions, so the
            # positions must be in ascending order.
            word_idx.sort()

            # Using the word index, compute clusters with a max distance
            # threshold for any two consecutive words
            clusters = []
            cluster = [word_idx[0]]
            for position in word_idx[1:]:
                if position - cluster[-1] < cluster_threshold:
                    cluster.append(position)
                else:
                    clusters.append(cluster)
                    cluster = [position]
            clusters.append(cluster)

            # Score each cluster. The max score for any given cluster is the
            # score for the sentence.
            max_cluster_score = 0
            for c in clusters:
                significant_words_in_cluster = len(c)
                total_words_in_cluster = c[-1] - c[0] + 1
                score = 1.0 * significant_words_in_cluster \
                    * significant_words_in_cluster / total_words_in_cluster
                if score > max_cluster_score:
                    max_cluster_score = score

            # Record the best cluster score, not merely the last one computed
            scores.append((sentence_idx, max_cluster_score))

        return scores
    # End - nested helper function

    extractor = Extractor(extractor='ArticleExtractor', url=url, html=html)

    # It's entirely possible that this "clean page" will be a big mess. YMMV.
    # The good news is that the summarize algorithm inherently accounts for
    # handling a lot of this noise.
    txt = extractor.getText()

    sentences = list(nltk.tokenize.sent_tokenize(txt))
    normalized_sentences = [s.lower() for s in sentences]

    words = [w.lower()
             for sentence in normalized_sentences
             for w in nltk.tokenize.word_tokenize(sentence)]

    fdist = nltk.FreqDist(words)

    # Load the stopword list once instead of once per candidate word
    stop_words = set(nltk.corpus.stopwords.words('english'))

    # Sort explicitly by descending frequency (ties broken alphabetically):
    # the ordering of FreqDist.items() differs between NLTK releases.
    ranked_words = sorted(fdist.items(), key=lambda item: (-item[1], item[0]))
    top_n_words = [word for (word, count) in ranked_words
                   if word not in stop_words][:n]

    scored_sentences = score_sentences(normalized_sentences, top_n_words)

    # Nothing could be scored (e.g. empty page): return empty summaries
    # rather than computing statistics over an empty list.
    if not scored_sentences:
        return dict(top_n_summary=[], mean_scored_summary=[])

    # Summarization Approach 1:
    # Filter out nonsignificant sentences by using the average score plus a
    # fraction of the std dev as a filter
    all_scores = [score for (sent_idx, score) in scored_sentences]
    avg = numpy.mean(all_scores)
    std = numpy.std(all_scores)
    mean_scored = [(sent_idx, score) for (sent_idx, score) in scored_sentences
                   if score > avg + 0.5 * std]

    # Summarization Approach 2:
    # Another approach would be to return only the top N ranked sentences
    if top_sentences > 0:
        top_n_scored = sorted(scored_sentences,
                              key=lambda s: s[1])[-top_sentences:]
    else:
        # A slice of [-0:] would return every sentence, so handle it here
        top_n_scored = []
    # Restore document order
    top_n_scored = sorted(top_n_scored, key=lambda s: s[0])

    # Decorate the post object with summaries
    return dict(top_n_summary=[sentences[idx] for (idx, score) in top_n_scored],
                mean_scored_summary=[sentences[idx] for (idx, score) in mean_scored])
# Create a model by overriding yhat's BaseModel
class MySummarizerModel(BaseModel):
    """A yhat model that "predicts" the summary of a news article URL."""

    def require(self):
        """Import the libraries that ``summarize`` uses."""
        # NOTE(review): presumably yhat calls this hook on the cloud server to
        # load the model's dependencies - confirm against the yhat docs.
        from boilerpipe.extract import Extractor
        import nltk
        import numpy

    def transform(self, d):
        """Pair the incoming URL ``d`` with its computed summary."""
        return (d, summarize(url=d),)

    def predict(self, d):
        """Turn the (url, summary) pair from ``transform`` into a response dict."""
        return {'url': d[0], 'summary': d[1]}
if __name__ == '__main__':

    # Get the username, api key, and url from the command line
    if len(sys.argv) < 4:
        sys.exit('Usage: python %s <username> <apikey> <url>' % sys.argv[0])
    USERNAME, APIKEY = sys.argv[1], sys.argv[2]

    # Any URL for a web page containing a news article can be used here.
    URL = sys.argv[3]

    # Create a model to use
    my_summarizer_model = MySummarizerModel(udfs=[summarize])

    # Create an API connection
    # NOTE(review): this statement was missing from the file although `yh` is
    # used below; confirm the constructor arguments against the installed
    # yhat client.
    yh = Yhat(USERNAME, APIKEY)

    # Upload the model to the yhat server and print the results.
    # Commented out, because it's already uploaded.
    #print(yh.upload("MySummarizerModel", my_summarizer_model))

    # How to list available models and versions. (Would be convenient
    # to not have to provide a specific version, which would call the
    # latest available model.)
    #print(yh.show_models())

    # Make a prediction with the uploaded model and display the result.
    prediction = yh.raw_predict("MySummarizerModel", 1, URL)
    # Single-argument print() behaves the same on Python 2 and Python 3
    print(json.dumps(prediction, indent=2))

    # You can also use yhat's API to predict with a RESTful endpoint.
    # (BTW, it's unfortunate that you cannot GET /predict with REST. This
    # particular model is a great example of why that would be super-useful.)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment