Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
尝试用这篇post: http://www.matrix67.com/blog/archives/5044 中的方法实现的一个自动中文抽词算法的Python程序
# -*- coding=utf-8 -*-
import feedparser
import re
import collections
import math
def info_entropy(words):
    """Return the Shannon entropy (natural log, i.e. in nats) of a count table.

    ``words`` is a mapping from item to occurrence count, e.g. a
    ``collections.Counter``.  Each count is normalised by the sum of all
    counts to obtain a probability ``p`` and ``-p * log(p)`` is accumulated.

    Entries whose count is not positive are ignored: they contribute
    nothing to the distribution and ``math.log(0)`` would raise
    ``ValueError``.  An empty (or all-zero) table yields ``0``.
    """
    # .values() exists on both Python 2 and Python 3 mappings,
    # unlike the Python-2-only .iteritems().
    counts = [cnt for cnt in words.values() if cnt > 0]
    total = sum(counts)
    if not total:
        return 0
    result = 0
    for cnt in counts:
        # float() keeps the division exact under Python 2 integer semantics.
        p = float(cnt) / total
        result -= p * math.log(p)
    return result
# ---------------------------------------------------------------------------
# Tunable parameters of the extraction.
# ---------------------------------------------------------------------------
max_word_len = 5          # longest candidate, in characters
entropy_threshold = 1     # minimum neighbour entropy (nats) required per side
min_word_freq = 3         # candidates seen fewer times than this are dropped
cohesion_threshold = 100  # minimum P(word) / max(P(prefix) * P(suffix))

# ---------------------------------------------------------------------------
# 1. Fetch the feed and gather the text of every entry.
# ---------------------------------------------------------------------------
content = []
articles = feedparser.parse('http://www.liyaos.com/blog/feed')
for article in articles.entries:
    content.append(article.title)
    # Strip HTML tags and &nbsp; entities from the body; the remaining text
    # fragments are glued back together so inline markup does not split words.
    content.append(u''.join(
        re.split('<.*?>|&nbsp;', article.description, flags=re.UNICODE)))
# Join the fields with a newline (a non-word character) so that the end of a
# title is never fused with the start of the following body or article, which
# would create n-grams that do not exist in the text.
content = u'\n'.join(content)

# Cut the text into runs of word characters, additionally treating ASCII
# letters and digits as separators so that only non-ASCII runs remain.
sentences = re.split(r"\W+|[a-zA-Z0-9]+", content, flags=re.UNICODE)

# ---------------------------------------------------------------------------
# 2. Count every substring of length 1..max_word_len in every run.
# ---------------------------------------------------------------------------
freq = collections.Counter()
for sentence in sentences:
    if sentence:
        l = len(sentence)
        wl = min(l, max_word_len)
        for i in range(1, wl + 1):
            for j in range(0, l - i + 1):
                freq[sentence[j:j + i]] += 1

# Relative frequency of each substring among all counted substrings.
total = sum(freq.values())
ps = collections.defaultdict(int)
for word, val in freq.items():
    ps[word] = float(val) / total

# ---------------------------------------------------------------------------
# 3. Cohesion filter: keep substrings that occur far more often than the most
#    likely way of obtaining them by concatenating two shorter pieces.
# ---------------------------------------------------------------------------
words = set()
# Iterate over a snapshot so lookups in the defaultdict can never disturb
# the iteration on Python 3.
for word, word_p in list(ps.items()):
    if len(word) < 2 or freq[word] < min_word_freq:
        continue
    p = max(ps[word[:i]] * ps[word[i:]] for i in range(1, len(word)))
    if p > 0 and word_p / p > cohesion_threshold:
        words.add(word)

# ---------------------------------------------------------------------------
# 4. Freedom filter: a candidate must appear next to varied characters on
#    both sides (high neighbour entropy), otherwise it is likely a fragment
#    of a longer word.
# ---------------------------------------------------------------------------
final_words = set()
for word in words:
    left_words = collections.Counter()
    right_words = collections.Counter()
    word_len = len(word)
    for sentence in sentences:
        # Visit every occurrence (including overlapping ones), not just the
        # first one in the run, and read the neighbours by position so a
        # neighbour equal to the word's own first/last character is counted.
        start = sentence.find(word)
        while start != -1:
            end = start + word_len
            if start > 0:
                left_words[sentence[start - 1]] += 1
            if end < len(sentence):
                right_words[sentence[end]] += 1
            start = sentence.find(word, start + 1)
    # A side with no neighbours at all (word only seen at a run boundary)
    # is not held against the candidate.
    if left_words and info_entropy(left_words) < entropy_threshold:
        continue
    if right_words and info_entropy(right_words) < entropy_threshold:
        continue
    final_words.add(word)

# ---------------------------------------------------------------------------
# 5. Report the surviving words, most frequent first.
# ---------------------------------------------------------------------------
words_list = sorted(final_words, key=lambda w: freq[w], reverse=True)
for word in words_list:
    line = u'%s %d' % (word, freq[word])
    # On Python 2 (where str is bytes) emit UTF-8 bytes explicitly so the
    # output does not depend on the terminal encoding; Python 3 prints text.
    print(line.encode('utf8') if str is bytes else line)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment