Created
June 29, 2014 03:13
-
-
Save yubessy/7c590984bb19b89b8c6d to your computer and use it in GitHub Desktop.
TextRank
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*-
# third-party lib
import networkx
def blank_split(text):
    """Split text into words on whitespace.

    Relies on ``str.split()`` with no separator, so any run of whitespace
    (spaces, tabs, newlines) acts as a single delimiter and no empty
    strings are produced.

    Args:
        text: The string to tokenize.

    Returns:
        A list of the whitespace-separated words found in ``text``.
    """
    return text.split()
def ngram(words, window=2):
    """Build the list of N-grams from a sequence of words.

    Args:
        words: A sliceable sequence of words (it must support ``len()``
            and slicing).
        window: Number of consecutive words in each N-gram. Defaults to 2.

    Returns:
        A list of tuples, each holding ``window`` consecutive words, in
        the order they occur in ``words``. The list is empty when
        ``words`` has fewer than ``window`` items.
    """
    # "+ 1" keeps the last window, the one ending on the final word.
    start_count = len(words) - window + 1
    return [tuple(words[i:i + window]) for i in range(start_count)]
def combination(words):
    """Return every unordered pair of distinct words.

    Each pair is emitted as ``(x, y)`` with ``x > y``, so a given pair
    appears in one orientation only and a word is never paired with
    itself. Repeated words in the input do not produce repeated pairs
    because the pairs are collected in a set.

    Args:
        words: A re-iterable collection of mutually comparable items
            (it is traversed in a nested loop).

    Returns:
        A list of 2-tuples. The order of the list is unspecified because
        it is produced from a set.
    """
    return list({(x, y) for x in words for y in words if x > y})
def textgraph(text, tokenizer=blank_split, window=2):
    """Build a word co-occurrence graph from text.

    The text is tokenized, cut into N-grams of ``window`` consecutive
    words, and every pair of distinct words inside the same N-gram is
    joined by an edge.

    Args:
        text: The input passed to ``tokenizer``.
        tokenizer: Callable turning ``text`` into a sequence of words.
            Defaults to ``blank_split``.
        window: Size of the co-occurrence window handed to ``ngram``.
            Defaults to 2.

    Returns:
        A ``networkx.Graph``. Nodes are created only through the added
        edges, so a word that never shares a window with a different
        word does not appear in the graph.
    """
    words = tokenizer(text)
    graph = networkx.Graph()
    for ng_unit in ngram(words, window):
        # Link every pair of distinct words that co-occur in this window.
        graph.add_edges_from(combination(ng_unit))
    return graph
def textrank(graph):
    """Compute PageRank scores for the nodes of a graph.

    Thin wrapper around ``networkx.pagerank`` called with its default
    parameters.

    Args:
        graph: A networkx graph, e.g. the one built by ``textgraph``.

    Returns:
        The result of ``networkx.pagerank(graph)`` (per the networkx
        documentation, a dict mapping each node to its PageRank value).
    """
    return networkx.pagerank(graph)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment