Skip to content

Instantly share code, notes, and snippets.

@infinityfuture
Last active February 15, 2019 16:01
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save infinityfuture/bed6fc5592a65c793aa60135b267f0a3 to your computer and use it in GitHub Desktop.
Save infinityfuture/bed6fc5592a65c793aa60135b267f0a3 to your computer and use it in GitHub Desktop.
TextRank extract keywords
"""
TextRank keyword extraction on a small hard-coded word list.

References:
http://www.hankcs.com/nlp/textrank-algorithm-to-extract-the-keywords-java-implementation.html
http://www.hankcs.com/nlp/textrank-algorithm-java-implementation-of-automatic-abstract.html
"""
import numpy as np
def similar(word_a, word_b):
    """Return the edge weight between two words.

    Every pair of words gets the same weight, so both arguments are
    ignored and the co-occurrence graph is effectively unweighted.

    Args:
        word_a: First word (unused).
        word_b: Second word (unused).

    Returns:
        The constant weight ``1``.
    """
    uniform_weight = 1
    return uniform_weight
def new_ws(i, word_i, ws, word_near, d=0.85, word_index=None):
    """Compute one TextRank update of the score of ``word_i``.

    The new score is ``(1 - d) + d * S`` where ``S`` sums, over every
    neighbour ``word_j`` of ``word_i``, the term
    ``similar(word_j, word_i) / total_j * ws[position of word_j]`` and
    ``total_j`` is the sum of ``similar(word_j, word_k)`` over the
    neighbours ``word_k`` of ``word_j``.

    Args:
        i: Position of ``word_i`` in ``ws``. Not used by the computation;
            kept so existing callers keep working.
        word_i: The word whose score is being updated.
        ws: Current scores, indexable by integer position, one per word.
        word_near: Mapping from each word to the set of its neighbours.
        d: Damping factor.
        word_index: Optional mapping from word to its position in ``ws``.
            When omitted, positions follow the sorted order of the keys
            of ``word_near``; pass it explicitly if ``ws`` is ordered
            differently or to avoid rebuilding the mapping on every call.

    Returns:
        The updated score for ``word_i``.
    """
    if word_index is None:
        word_index = {
            word: pos for pos, word in enumerate(sorted(word_near))
        }

    s = 0
    for word_j in word_near[word_i]:
        if word_j == word_i:
            continue
        w_sum_j_k = sum(
            similar(word_j, word_k)
            for word_k in word_near[word_j]
            if word_k != word_j
        )
        if not w_sum_j_k:
            # word_j has no outgoing weight to share; skipping it avoids
            # a division by zero.
            continue
        # Read the neighbour's score by its position in the score vector,
        # not by its enumeration order inside the (unordered) neighbour set.
        s += similar(word_j, word_i) / w_sum_j_k * ws[word_index[word_j]]
    return (1 - d) + d * s
# Sample document as an ordered list of word tokens. Repeated words are kept
# because co-occurrence below depends on each token's position.
words = [
    '程序员',
    '英文',
    '程序',
    '开发',
    '维护',
    '专业',
    '人员',
    '程序员',
    '分为',
    '程序',
    '设计',
    '人员',
    '程序',
    '编码',
    '人员',
    '界限',
    '特别',
    '中国',
    '软件',
    '人员',
    '分为',
    '程序员',
    '高级',
    '程序员',
    '系统',
    '分析员',
    '项目',
    '经理',
]

# Number of tokens on each side of a word that count as co-occurring with it.
WINDOW_RADIUS = 5

# word_near maps each distinct word to the set of *other* words that appear
# within WINDOW_RADIUS positions of any of its occurrences.
word_near = {}
for i, word in enumerate(words):
    neighbours = word_near.setdefault(word, set())
    start_ind = max(i - WINDOW_RADIUS, 0)
    # The slice end is exclusive, so add 1 to take in the full radius after
    # position i as well as before it; this keeps the relation symmetric.
    end_ind = min(i + WINDOW_RADIUS + 1, len(words))
    neighbours.update(
        other for other in words[start_ind:end_ind] if other != word
    )
# From here on `words` is the sorted vocabulary (distinct words), and
# weight[k] is the current score of words[k].
words = sorted(word_near)
weight = np.ones(len(words))
max_iter = 200
tol = 1e-3

# Re-score every word from the previous scores until the summed squared
# change drops below `tol`, or `max_iter` passes have been made.
for i in range(max_iter):
    scores = []
    for idx, word_i in enumerate(words):
        scores.append(new_ws(idx, word_i, weight, word_near))
    new_weight = np.array(scores)
    delta = weight - new_weight
    if np.sum(delta ** 2) < tol:
        break
    weight = new_weight

# Print (word, score) pairs, highest score first.
ranking = sorted(zip(words, weight), key=lambda pair: pair[1], reverse=True)
print(ranking)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment