Skip to content

Instantly share code, notes, and snippets.

@infinityfuture
Created October 4, 2018 08:08
Show Gist options
  • Save infinityfuture/67d70f5e90048b347854a2033cbe367a to your computer and use it in GitHub Desktop.
Save infinityfuture/67d70f5e90048b347854a2033cbe367a to your computer and use it in GitHub Desktop.
TextRank keyword extraction using word2vec similarity as edge weights
"""
Reference:
http://www.hankcs.com/nlp/textrank-algorithm-to-extract-the-keywords-java-implementation.html
http://www.hankcs.com/nlp/textrank-algorithm-java-implementation-of-automatic-abstract.html
Chinese Embedding From
https://github.com/Embedding/Chinese-Word-Vectors
"""
import gensim
import numpy as np
def similar(word_a, word_b, model):
    """Return the similarity between two words as reported by ``model``.

    This is the single place where the edge weight between two words is
    defined, so swapping the similarity measure only requires changing
    this function.

    Args:
        word_a: First word.
        word_b: Second word.
        model: Object exposing ``similarity(word_a, word_b)``; this script
            passes the gensim ``KeyedVectors`` loaded at module level.

    Returns:
        Whatever ``model.similarity`` returns for the pair.
    """
    return model.similarity(word_a, word_b)
def new_ws(i, word_i, ws, word_near, model, d=0.85, word_index=None):
    """Compute one TextRank update of the score of ``word_i``.

    Implements the weighted TextRank rule

        WS(i) = (1 - d) + d * sum_j( w_ji / sum_k(w_jk) * WS(j) )

    where ``j`` ranges over the neighbours of ``word_i``, ``k`` ranges over
    the neighbours of ``word_j`` and every weight ``w`` comes from
    ``similar``.

    Args:
        i: Position of ``word_i`` in the caller's word list. Unused; kept
            so existing call sites keep working.
        word_i: Word whose score is being updated.
        ws: Current scores, indexable by integer position.
        word_near: Mapping from each word to the set of its neighbours.
        model: Similarity model forwarded to ``similar``.
        d: Damping factor.
        word_index: Optional mapping from word to its position in ``ws``.
            When omitted, positions are taken from ``sorted(word_near)``,
            which is the order the calling script uses to build ``ws``.

    Returns:
        The updated score for ``word_i``.
    """
    if word_index is None:
        # The score vector is laid out in sorted-word order by the caller;
        # rebuild that lookup so each neighbour's own score is read instead
        # of whatever happens to sit at its position inside the set.
        word_index = {word: pos for pos, word in enumerate(sorted(word_near))}

    s = 0
    for word_j in word_near[word_i]:
        if word_j == word_i:
            continue
        w_j_i = similar(word_j, word_i, model)
        # Total weight leaving word_j, used to normalise its contribution.
        w_sum_j_k = 0
        for word_k in word_near[word_j]:
            if word_k == word_j:
                continue
            w_sum_j_k += similar(word_j, word_k, model)
        if w_sum_j_k == 0:
            # No outgoing weight: word_j has nothing to hand out, and
            # dividing would produce inf/nan.
            continue
        s += w_j_i / w_sum_j_k * ws[word_index[word_j]]
    return (1 - d) + d * s
# Pretrained Chinese word vectors (see the Chinese-Word-Vectors link in the
# module docstring); binary=False reads the file in word2vec text format.
embedding_path = 'sgns.baidubaike.bigram-char'
model = gensim.models.KeyedVectors.load_word2vec_format(
    embedding_path,
    binary=False,
)
# Sample token sequence (presumably a pre-segmented sentence). Order matters:
# neighbours are taken from a sliding window over this sequence.
words = [
    '程序员',
    '英文',
    '程序',
    '开发',
    '维护',
    '专业',
    '人员',
    '程序员',
    '分为',
    '程序',
    '设计',
    '人员',
    '程序',
    '编码',
    '人员',
    '界限',
    '特别',
    '中国',
    '软件',
    '人员',
    '分为',
    '程序员',
    '高级',
    '程序员',
    '系统',
    '分析员',
    '项目',
    '经理',
]

# Number of tokens considered on EACH side of a word when collecting its
# neighbours.
window = 5

# word_near maps every distinct word to the set of different words that
# occur within `window` positions of any of its occurrences.
word_near = {}
for i, word in enumerate(words):
    neighbours = word_near.setdefault(word, set())
    start_ind = max(i - window, 0)
    # The slice end is exclusive, so add 1 to reach `window` tokens to the
    # right as well; without it the window is lopsided (5 back, 4 forward)
    # and the neighbour relation is not symmetric.
    end_ind = min(i + window + 1, len(words))
    neighbours.update(
        other for other in words[start_ind:end_ind] if other != word
    )
# Fix a deterministic order of the distinct words; position k of `weight`
# always holds the score of words[k].
words = sorted(word_near)
weight = np.ones(len(words))
max_iter = 200
tol = 1e-3
for _ in range(max_iter):
    # Recompute every score from the previous iteration's scores.
    new_weight = np.array([
        new_ws(idx, word_i, weight, word_near, model)
        for idx, word_i in enumerate(words)
    ])
    # Squared L2 distance between consecutive iterates.
    converged = np.sum((weight - new_weight) ** 2) < tol
    # Store the newest scores BEFORE deciding to stop, so the converged
    # iterate is the one that gets reported rather than its predecessor.
    weight = new_weight
    if converged:
        break
# Words with their scores, highest score first.
print(sorted(zip(words, weight), key=lambda x: x[1], reverse=True))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment