@lastland
Created August 11, 2012 07:14
A Python program implementing the automatic Chinese word-extraction algorithm described in this post: http://www.matrix67.com/blog/archives/5044. Two scripts follow: a comparison script that ranks the extracted words per source, and the extractor itself (WordsDetector.py, the second script).
# -*- coding=utf-8 -*-
import collections

# Usage:
# Write the output of WordsDetector.py for each source to a file,
# then list the file names in names below and run this program.
names = ['name0',
         'name1',
         'name2',
         'name3']

# Per-source word frequencies, plus the combined frequency over all sources.
words = dict([(i, collections.Counter()) for i in names])
total_words = collections.Counter()
for name in names:
    f = open(name)
    for line in f:
        word, freq = line.split()
        words[name][word] += int(freq)
    f.close()
    total_words += words[name]

# For each source, score every word with a smoothed relative frequency:
# (freq + avg * total) / (total_words[word] + total), where total is the
# average combined frequency and avg the average relative frequency.
ps = dict([(i, collections.defaultdict(int)) for i in names])
for name in names:
    print name
    cnt = total = avg = 0.0
    for word, freq in words[name].iteritems():
        cnt += 1
        total += total_words[word]
        avg += float(freq) / total_words[word]
    total /= cnt
    avg /= cnt
    avg_times_total = total * avg
    for word, freq in words[name].iteritems():
        ps[name][word] = (float(freq) + avg_times_total) / (total_words[word] + total)
    # Print the ten highest-scoring words for this source.
    word_list = list(set(words[name]))
    word_list.sort(cmp=lambda x, y: cmp(ps[name][y], ps[name][x]))
    cnt = 0
    for word in word_list:
        print '* ', word, ps[name][word]
        cnt += 1
        if cnt >= 10:
            break
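The comparison script parses each line with line.split(), so every input file must hold one word and its integer frequency per line, space separated, which is the format WordsDetector.py prints. A minimal sketch that produces such a file from a Counter (the file name and the sample words here are hypothetical):

# -*- coding=utf-8 -*-
import collections

# Hypothetical frequencies, in the same format WordsDetector.py emits.
sample = collections.Counter({u'抽词': 9, u'信息熵': 5})
f = open('name0', 'w')
for word, cnt in sample.iteritems():
    f.write('%s %d\n' % (word.encode('utf8'), cnt))
f.close()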
# -*- coding=utf-8 -*-
import feedparser
import re
import collections
import math

def info_entropy(words):
    # Shannon entropy (natural log) of a character-frequency distribution.
    result = 0
    total = sum([val for _, val in words.iteritems()])
    for word, cnt in words.iteritems():
        p = float(cnt) / total
        result -= p * math.log(p)
    return result

max_word_len = 5
entropy_threshold = 1

# Pull the corpus from an RSS feed: titles plus descriptions, with HTML tags
# and &nbsp; entities stripped.
content = []
articles = feedparser.parse('http://www.liyaos.com/blog/feed')
for article in articles.entries:
    content.append(article.title)
    content.extend(re.split('<.*?>|&nbsp;', article.description, 0, re.UNICODE))
    # Replace the line above with this if the feed is ATOM:
    # try:
    #     s = article.content[0]['value']
    # except AttributeError:
    #     try:
    #         s = article.summary
    #     except AttributeError:
    #         s = ''
    # content.extend(re.split('<.*?>|&nbsp;', s, 0, re.UNICODE))
content = u''.join(content)

# Split on non-word characters and Latin/digit runs, leaving CJK "sentences".
sentences = re.split("\W+|[a-zA-Z0-9]+", content, 0, re.UNICODE)

# Count every substring of length 1..max_word_len as a candidate word.
freq = collections.Counter()
for sentence in sentences:
    if sentence:
        l = len(sentence)
        wl = min(l, max_word_len)
        for i in range(1, wl + 1):
            for j in range(0, l - i + 1):
                freq[sentence[j:j + i]] += 1

total = sum([val for _, val in freq.iteritems()])
ps = collections.defaultdict(int)
for word, val in freq.iteritems():
    ps[word] = float(val) / total

# Cohesion filter: a candidate survives only if its observed probability is
# at least 100 times the best "independent split" baseline ps[left] * ps[right].
words = set()
for word, word_p in ps.items():
    if len(word) > 1:
        p = 0
        for i in range(1, len(word)):
            t = ps[word[0:i]] * ps[word[i:]]
            p = max(p, t)
        if freq[word] >= 3 and word_p / p > 100:
            words.add(word)

# Boundary-entropy filter: a real word should occur with many different left
# and right neighbor characters, i.e. high entropy on both sides.
final_words = set()
for word in words:
    lf = rf = True
    left_words = collections.Counter()
    right_words = collections.Counter()
    # '.?word.?' captures at most one character of context on each side.
    pattern = re.compile(word.join(['.?', '.?']))
    for sentence in sentences:
        l = pattern.findall(sentence)
        if l:
            if l[0][0] != word[0]:
                left_words[l[0][0]] += 1
            else:
                # The word appeared at a sentence boundary; skip the left test.
                lf = False
            if l[0][-1] != word[-1]:
                right_words[l[0][-1]] += 1
            else:
                rf = False
    left_info_entropy = info_entropy(left_words)
    right_info_entropy = info_entropy(right_words)
    if lf and len(left_words) > 0 and left_info_entropy < entropy_threshold:
        continue
    if rf and len(right_words) > 0 and right_info_entropy < entropy_threshold:
        continue
    final_words.add(word)

words_list = list(final_words)
words_list.sort(cmp=lambda x, y: cmp(freq[y], freq[x]))
for word in words_list:
    print word.encode('utf8'), freq[word]
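As a quick sanity check of the boundary filter: info_entropy is zero when a word always has the same neighbor and grows with neighbor diversity, and four equally likely neighbors already give ln 4 ≈ 1.39, above entropy_threshold = 1. A minimal demonstration, reusing the info_entropy definition above (the neighbor counts are made up):

import collections

# A single possible neighbor: entropy 0, so the word would be filtered out.
print info_entropy(collections.Counter({u'的': 10}))  # 0.0
# Four equally likely neighbors: entropy ln(4) ~ 1.39 > entropy_threshold.
print info_entropy(collections.Counter({u'我': 1, u'你': 1, u'他': 1, u'她': 1}))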
@leekuhle

What does the ps[word[0:i]] * ps[word[i:]] part compute?
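That product is the probability the candidate would have if its two halves occurred independently; the loop over i takes the most probable split, and the candidate survives only when its observed probability beats that baseline 100-fold. A sketch with made-up numbers (the words and probabilities are purely illustrative):

# Hypothetical probabilities for illustration only.
ps = {u'电影院': 0.0005, u'电影': 0.002, u'院': 0.001, u'电': 0.003, u'影院': 0.0008}
word = u'电影院'
p = max(ps[word[0:i]] * ps[word[i:]] for i in range(1, len(word)))
print ps[word] / p  # cohesion ratio, about 208 here; the script requires > 100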

@tina437213

Indeed, when the input file is large (say 20 MB), the entropy-filter loop is very slow. Also, why do I get this error when writing the results to a file: TypeError: not all arguments converted during string formatting?

fileobj = open('gugong_word2.txt' % 'wb')
for word in words_list:
    print word.encode('utf-8','ignore'), freq[word]
    word_freq = '%s\t%d\n' % (word.encode('utf-8','ignore'), freq[word])
    fileobj.write(word_freq)
fileobj.close()

TypeError : not all arguments converted during string formatting
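The TypeError comes from the open() call, not the write loop: 'gugong_word2.txt' % 'wb' applies the string-formatting operator to a string that contains no % placeholders, which raises exactly this error. The mode should be a second argument:

fileobj = open('gugong_word2.txt', 'wb')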
