Skip to content

Instantly share code, notes, and snippets.

What would you like to do?
TextRank algorithm for text summarization.
from itertools import combinations
from nltk.tokenize import sent_tokenize, RegexpTokenizer
from nltk.stem.snowball import RussianStemmer
import networkx as nx
def similarity(s1, s2):
if not len(s1) or not len(s2):
return 0.0
return len(s1.intersection(s2))/(1.0 * (len(s1) + len(s2)))
def textrank(text):
sentences = sent_tokenize(text)
tokenizer = RegexpTokenizer(r'\w+')
lmtzr = RussianStemmer()
words = [set(lmtzr.stem(word) for word in tokenizer.tokenize(sentence.lower()))
for sentence in sentences]
pairs = combinations(range(len(sentences)), 2)
scores = [(i, j, similarity(words[i], words[j])) for i, j in pairs]
scores = filter(lambda x: x[2], scores)
g = nx.Graph()
pr = nx.pagerank(g)
return sorted(((i, pr[i], s) for i, s in enumerate(sentences) if i in pr), key=lambda x: pr[x[0]], reverse=True)
def extract(text, n=5):
tr = textrank(text)
top_n = sorted(tr[:n])
return ' '.join(x[2] for x in top_n)

This comment has been minimized.

Copy link
Owner Author

@igor-shevchenko igor-shevchenko commented Jun 20, 2013

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
You can’t perform that action at this time.