Skip to content

Instantly share code, notes, and snippets.

@inkless
Created April 24, 2013 01:27
Show Gist options
  • Save inkless/5448901 to your computer and use it in GitHub Desktop.
Save inkless/5448901 to your computer and use it in GitHub Desktop.
对文章进行自动摘要(Automatic summarization)的算法实现(python)
# -*- coding: utf-8 -*-
import sys
import json
import nltk
import numpy
N = 100 # Number of words to consider
CLUSTER_THRESHOLD = 5 # Distance between words to consider
TOP_SENTENCES = 5 # Number of sentences to return for a "top n" summary
# Approach taken from "The Automatic Creation of Literature Abstracts" by H.P. Luhn
def _score_sentences(sentences, important_words):
scores = []
sentence_idx = -1
for s in [nltk.tokenize.word_tokenize(s) for s in sentences]:
sentence_idx += 1
word_idx = []
# For each word in the word list...
for w in important_words:
try:
# Compute an index for where any important words occur in the sentence
word_idx.append(s.index(w))
except ValueError, e: # w not in this particular sentence
pass
word_idx.sort()
# It is possible that some sentences may not contain any important words at all
if len(word_idx)== 0: continue
# Using the word index, compute clusters by using a max distance threshold
# for any two consecutive words
clusters = []
cluster = [word_idx[0]]
i = 1
while i < len(word_idx):
if word_idx[i] - word_idx[i - 1] < CLUSTER_THRESHOLD:
cluster.append(word_idx[i])
else:
clusters.append(cluster[:])
cluster = [word_idx[i]]
i += 1
clusters.append(cluster)
# Score each cluster. The max score for any given cluster is the score
# for the sentence
max_cluster_score = 0
for c in clusters:
significant_words_in_cluster = len(c)
total_words_in_cluster = c[-1] - c[0] + 1
score = 1.0 * significant_words_in_cluster \
* significant_words_in_cluster / total_words_in_cluster
if score > max_cluster_score:
max_cluster_score = score
scores.append((sentence_idx, score))
return scores
def summarize(txt):
sentences = [s for s in nltk.tokenize.sent_tokenize(txt)]
normalized_sentences = [s.lower() for s in sentences]
words = [w.lower() for sentence in normalized_sentences for w in
nltk.tokenize.word_tokenize(sentence)]
fdist = nltk.FreqDist(words)
top_n_words = [w[0] for w in fdist.items()
if w[0] not in nltk.corpus.stopwords.words('english')][:N]
scored_sentences = _score_sentences(normalized_sentences, top_n_words)
# Summaization Approach 1:
# Filter out non-significant sentences by using the average score plus a
# fraction of the std dev as a filter
avg = numpy.mean([s[1] for s in scored_sentences])
std = numpy.std([s[1] for s in scored_sentences])
mean_scored = [(sent_idx, score) for (sent_idx, score) in scored_sentences
if score > avg + 0.5 * std]
# Summarization Approach 2:
# Another approach would be to return only the top N ranked sentences
top_n_scored = sorted(scored_sentences, key=lambda s: s[1])[-TOP_SENTENCES:]
top_n_scored = sorted(top_n_scored, key=lambda s: s[0])
# Decorate the post object with summaries
return dict(top_n_summary=[sentences[idx] for (idx, score) in top_n_scored],
mean_scored_summary=[sentences[idx] for (idx, score) in mean_scored])
if __name__ == '__main__':
# Load in output from blogs_and_nlp__get_feed.py
BLOG_DATA = sys.argv[1]
blog_data = json.loads(open(BLOG_DATA).read())
for post in blog_data:
post.update(summarize(post['content']))
print post['title']
print '-' * len(post['title'])
print
print '-------------'
print 'Top N Summary'
print '-------------'
print ' '.join(post['top_n_summary'])
print
print '-------------------'
print 'Mean Scored Summary'
print '-------------------'
print ' '.join(post['mean_scored_summary'])
print
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment