Skip to content

Instantly share code, notes, and snippets.

@infinityfuture
Last active October 4, 2018 08:11
Show Gist options
  • Save infinityfuture/edee3a06184250995fb9905bf2332a8c to your computer and use it in GitHub Desktop.
Save infinityfuture/edee3a06184250995fb9905bf2332a8c to your computer and use it in GitHub Desktop.
Use the TextRank algorithm to generate a summary
"""
TextRank-style ranking of tokenized sentences, using BM25 similarity scores
as edge weights.

Reference:
http://www.hankcs.com/nlp/textrank-algorithm-to-extract-the-keywords-java-implementation.html
http://www.hankcs.com/nlp/textrank-algorithm-java-implementation-of-automatic-abstract.html
"""
from gensim.summarization.bm25 import get_bm25_weights
import numpy as np
# Sample input: a list of token lists (each inner list holds word strings,
# presumably one pre-segmented sentence per list). This is passed to
# get_bm25_weights and enumerated when scores are updated below.
corpus = [
["算法", "大致", "分", "基本", "算法", "数据", "结构", "算法", "数论", "算法", "计算", "几何", "算法", "图", "算法", "动态", "规划", "数值", "分析", "加密", "算法", "排序", "算法", "检索", "算法", "随机", "化", "算法", "并行", "算法", "厄", "米", "变形", "模型", "随机", "森林", "算法"],
["算法", "宽泛", "分为", "三类"],
["有限", "确定性", "算法"],
["类", "算法", "有限", "一段", "时间", "终止"],
["可能", "花", "长", "时间", "执行", "指定", "任务"],
["一定", "时间", "终止"],
["类", "算法", "得出", "常", "取决", "输入", "值"],
["二"],
["有限", "非", "确定", "算法"],
["类", "算法", "有限", "时间", "终止"],
["一个", "定", "数值"],
["算法", "唯一", "确定"],
["三"],
["无限", "算法"],
["没有", "定义", "终止", "定义", "条件"],
["定义", "条件", "无法", "输入", "数据", "满足", "终止", "运行", "算法"],
["通常"],
["无限", "算法", "产生", "未", "确定", "定义", "终止", "条件"]
]
def new_ws(i, word_i, ws, corpus, similarity_matrix, d=0.85):
    """Compute one TextRank-style score update for node ``i``.

    Evaluates ``(1 - d) + d * sum(w_ji / row_sum_j * ws[j] for j != i)``
    where ``w_ji`` is ``similarity_matrix[j][i]`` and ``row_sum_j`` is the
    sum of row ``j`` of ``similarity_matrix`` (diagonal entry included).

    Args:
        i: Index of the node whose score is being recomputed.
        word_i: Item at index ``i``; not used by the computation, kept so
            existing callers keep working.
        ws: Current scores, one per node; its length sets the node count.
        corpus: Not used by the computation, kept so existing callers keep
            working.
        similarity_matrix: Pairwise weights, indexable as
            ``similarity_matrix[j][i]`` (nested lists or a 2-D array).
        d: Damping factor.

    Returns:
        The updated score for node ``i``.
    """
    # Take the node count from the scores that were passed in rather than
    # from a module-level variable, so the function works on its own.
    size = len(ws)
    s = 0
    for j in range(size):
        if j == i:
            continue
        weight_sum_j = np.sum(similarity_matrix[j])
        # A row whose weights sum to zero contributes nothing; skipping it
        # avoids a 0/0 division that would turn the score into nan.
        if weight_sum_j == 0:
            continue
        s += d * similarity_matrix[j][i] / weight_sum_j * ws[j]
    return (1 - d) + s
# Iteration settings: hard cap on rounds and the squared-change threshold
# below which the scores are treated as converged.
max_iter = 200
tol = 1e-3

# Pairwise weights between corpus entries, as returned by get_bm25_weights.
similarity_matrix = get_bm25_weights(corpus)

# Every entry starts with a score of 1.
weight = np.ones(len(corpus))

for i in range(max_iter):
    print(i)
    # Recompute every score from the previous round's scores.
    new_weight = np.array([
        new_ws(idx, tokens, weight, corpus, similarity_matrix)
        for idx, tokens in enumerate(corpus)
    ])
    # Stop once the summed squared change drops below the tolerance; the
    # scores from the previous round are the ones kept in that case.
    if np.sum((weight - new_weight) ** 2) < tol:
        break
    weight = new_weight

# Print (tokens, score) pairs from highest score to lowest.
print(sorted(zip(corpus, weight), key=lambda pair: pair[1], reverse=True))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment