# VikParuchuri/textrank.py

## textrank.py
import networkx as nx
from nltk.tokenize.punkt import PunktSentenceTokenizer
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer

# Intended number of top-ranked sentences in a summary.
SENTENCES_IN_SUMMARY = 10
# Length bounds, in characters, for a sentence to be eligible for the summary.
# textrank() compares with strict > and <, so both bounds are exclusive.
MIN_SENTENCE_LENGTH = 50
MAX_SENTENCE_LENGTH = 200

def f7(seq):
    """Return the items of ``seq`` as a list with later duplicates dropped.

    The first occurrence of each item is kept and the original order is
    preserved.  Items must be hashable because membership is tracked in a
    set.
    """
    already_seen = set()
    unique_items = []
    for item in seq:
        if item in already_seen:
            continue
        already_seen.add(item)
        unique_items.append(item)
    return unique_items

def textrank(article):
    """Build an extractive summary of ``article`` by ranking its sentences.

    The text is split into sentences with the Punkt tokenizer, each
    lower-cased sentence is turned into a TF-IDF row, and the product of
    that matrix with its transpose is used as a weighted sentence graph.
    PageRank scores on the graph rank the sentences.

    Args:
        article: The document to summarise, as a single string.

    Returns:
        A string made of the best-scoring sentences, highest score first,
        joined by two spaces.  Only sentences whose length is strictly
        between ``MIN_SENTENCE_LENGTH`` and ``MAX_SENTENCE_LENGTH``
        characters are eligible, at most ``SENTENCES_IN_SUMMARY`` of them
        are taken, and exact duplicates among those are removed.  An empty
        string is returned for empty or whitespace-only input, or when no
        sentence is eligible.
    """
    # Blank input has no vocabulary to vectorise; return early instead of
    # letting the vectoriser fail on it.
    if not article or not article.strip():
        return ""

    sentences = PunktSentenceTokenizer().tokenize(article)
    if not sentences:
        return ""

    lowered = [sentence.lower() for sentence in sentences]
    bow_matrix = CountVectorizer().fit_transform(lowered)
    normalized = TfidfTransformer().fit_transform(bow_matrix)

    # ``dot`` is a matrix product for both scipy sparse matrices and sparse
    # arrays, whereas ``*`` is element-wise on sparse arrays.
    similarity_graph = normalized.dot(normalized.T)

    # networkx 3.0 removed ``from_scipy_sparse_matrix``; use its replacement
    # when it exists and fall back to the old name on earlier releases.
    build_graph = getattr(nx, "from_scipy_sparse_array", None)
    if build_graph is None:
        build_graph = nx.from_scipy_sparse_matrix
    scores = nx.pagerank(build_graph(similarity_graph))

    # Highest score first; ties fall back to comparing the sentence text.
    ranked = sorted(
        ((scores[i], sentence) for i, sentence in enumerate(sentences)),
        reverse=True
    )
    eligible = [
        sentence
        for _, sentence in ranked
        if MIN_SENTENCE_LENGTH < len(sentence) < MAX_SENTENCE_LENGTH
    ]

    # Slice instead of indexing over a fixed range so that articles with
    # fewer eligible sentences than SENTENCES_IN_SUMMARY do not raise
    # IndexError.  The limit is the module-level constant; no ``settings``
    # object exists in this module.
    return "  ".join(f7(eligible[:SENTENCES_IN_SUMMARY]))
	import networkx as nx
	from nltk.tokenize.punkt import PunktSentenceTokenizer
	from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer

	# Intended number of top-ranked sentences in a summary.
	SENTENCES_IN_SUMMARY = 10
	# Length bounds, in characters, for a sentence to be eligible for the summary.
	# textrank() compares with strict > and <, so both bounds are exclusive.
	MIN_SENTENCE_LENGTH = 50
	MAX_SENTENCE_LENGTH = 200

	def f7(seq):
		"""Return the items of ``seq`` as a list with later duplicates dropped.

		The first occurrence of each item is kept and the original order is
		preserved.  Items must be hashable because membership is tracked in
		a set.
		"""
		already_seen = set()
		unique_items = []
		for item in seq:
			if item in already_seen:
				continue
			already_seen.add(item)
			unique_items.append(item)
		return unique_items

	def textrank(article):
		"""Build an extractive summary of ``article`` by ranking its sentences.

		The text is split into sentences with the Punkt tokenizer, each
		lower-cased sentence is turned into a TF-IDF row, and the product of
		that matrix with its transpose is used as a weighted sentence graph.
		PageRank scores on the graph rank the sentences.

		Args:
			article: The document to summarise, as a single string.

		Returns:
			A string made of the best-scoring sentences, highest score
			first, joined by a single space.  Only sentences whose length
			is strictly between ``MIN_SENTENCE_LENGTH`` and
			``MAX_SENTENCE_LENGTH`` characters are eligible, at most
			``SENTENCES_IN_SUMMARY`` of them are taken, and exact
			duplicates among those are removed.  An empty string is
			returned for empty or whitespace-only input, or when no
			sentence is eligible.
		"""
		# Blank input has no vocabulary to vectorise; return early instead
		# of letting the vectoriser fail on it.
		if not article or not article.strip():
			return ""

		sentences = PunktSentenceTokenizer().tokenize(article)
		if not sentences:
			return ""

		lowered = [sentence.lower() for sentence in sentences]
		bow_matrix = CountVectorizer().fit_transform(lowered)
		normalized = TfidfTransformer().fit_transform(bow_matrix)

		# ``dot`` is a matrix product for both scipy sparse matrices and
		# sparse arrays, whereas ``*`` is element-wise on sparse arrays.
		similarity_graph = normalized.dot(normalized.T)

		# networkx 3.0 removed ``from_scipy_sparse_matrix``; use its
		# replacement when it exists and fall back on earlier releases.
		build_graph = getattr(nx, "from_scipy_sparse_array", None)
		if build_graph is None:
			build_graph = nx.from_scipy_sparse_matrix
		scores = nx.pagerank(build_graph(similarity_graph))

		# Highest score first; ties fall back to comparing the sentence text.
		ranked = sorted(
			((scores[i], sentence) for i, sentence in enumerate(sentences)),
			reverse=True
		)
		eligible = [
			sentence
			for _, sentence in ranked
			if MIN_SENTENCE_LENGTH < len(sentence) < MAX_SENTENCE_LENGTH
		]

		# Slice instead of indexing over a fixed range so that articles with
		# fewer eligible sentences than SENTENCES_IN_SUMMARY do not raise
		# IndexError.  The limit is the module-level constant; no
		# ``settings`` object exists in this module.
		return " ".join(f7(eligible[:SENTENCES_IN_SUMMARY]))