
@lastland
Created August 11, 2012 07:14
A Python program that attempts to implement an automatic Chinese word-extraction algorithm using the method from this post: http://www.matrix67.com/blog/archives/5044
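In brief (my paraphrase of the linked post, matching what the second script below does): a character n-gram w is accepted as a word only if it is cohesive, meaning its observed probability p(w) is far larger than max_i p(w[:i]) * p(w[i:]), the best product over any split of w into a left and a right part, and only if it is free, meaning the Shannon entropy of the characters observed immediately to its left and to its right is above a threshold.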
# -*- coding=utf-8 -*-
import collections
# Usage:
# My approach is to write the output of WordsDetector.py to files,
# then put the file names into the `names` list below and run this program.
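# A hypothetical run of the whole pipeline (file names are placeholders;
# WordsDetector.py is run once per feed, editing its feed URL each time):
#   python WordsDetector.py > name0
#   python WordsDetector.py > name1
#   ...
# and then run this program.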
names = ['name0',
         'name1',
         'name2',
         'name3']
words = dict([(i, collections.Counter()) for i in names])
total_words = collections.Counter()
for name in names:
    f = open(name)
    for line in f:
        word, freq = line.split()
        words[name][word] += int(freq)
    f.close()
    total_words += words[name]
ps = dict([(i, collections.defaultdict(int)) for i in names])
for name in names:
    print name
    cnt = total = avg = 0.0
    for word, freq in words[name].iteritems():
        cnt += 1
        total += total_words[word]
        avg += float(freq) / total_words[word]
    total /= cnt
    avg /= cnt
    # Smooth each word's share of its corpus-wide count toward the average
    # share, using the average corpus count as a pseudo-count.
    avg_times_total = total * avg
    for word, freq in words[name].iteritems():
        ps[name][word] = (float(freq) + avg_times_total) / (total_words[word] + total)
    # Print the ten words most characteristic of this source.
    word_list = list(set(words[name]))
    word_list.sort(cmp=lambda x, y: cmp(ps[name][y], ps[name][x]))
    cnt = 0
    for word in word_list:
        print '* ', word, ps[name][word]
        cnt += 1
        if cnt >= 10:
            break
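To see what that smoothing does, here is a minimal numeric sketch with made-up numbers (a hypothetical illustration, not part of the gist): avg_times_total acts as a pseudo-count that pulls a word's share of its corpus-wide count toward the average share, so a word seen once in one source and once overall cannot dominate the ranking on scant evidence.

# Hypothetical illustration of the smoothed score used above:
avg, total = 0.5, 200.0  # pretend per-source averages, computed as above
for freq, total_freq in [(1, 1), (1, 10), (120, 150)]:
    raw = float(freq) / total_freq
    smoothed = (freq + avg * total) / (total_freq + total)
    print 'freq=%3d total=%3d raw=%.2f smoothed=%.2f' % (freq, total_freq, raw, smoothed)

The second file of the gist follows; judging from the usage note above, this is WordsDetector.py, which pulls text from an RSS feed and prints detected words with their counts.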
# -*- coding=utf-8 -*-
import feedparser
import re
import collections
import math
def info_entropy(words):
    # Shannon entropy (in nats) of the character counts in `words`.
    result = 0
    total = sum([val for _, val in words.iteritems()])
    for word, cnt in words.iteritems():
        p = float(cnt) / total
        result -= p * math.log(p)
    return result
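# For example, info_entropy(collections.Counter(u'abab')) == math.log(2),
# about 0.69, while info_entropy(collections.Counter(u'aaaa')) == 0:
# varied neighbors mean high entropy, a single repeated neighbor means zero.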
max_word_len = 5         # longest candidate word, in characters
entropy_threshold = 1    # minimum left/right neighbor entropy, in nats
content = []
articles = feedparser.parse('http://www.liyaos.com/blog/feed')
for article in articles.entries:
    content.append(article.title)
    content.extend(re.split('<.*?>|&nbsp;', article.description, 0, re.UNICODE))
    # replace the line above with this if using ATOM:
    # try:
    #     s = article.content[0]['value']
    # except AttributeError:
    #     try:
    #         s = article.summary
    #     except AttributeError:
    #         s = ''
    # content.extend(re.split('<.*?>|&nbsp;', s, 0, re.UNICODE))
content = u''.join(content)
# Split the text into runs of word characters, treating punctuation,
# whitespace, and ASCII alphanumerics as sentence boundaries.
sentences = re.split(u'\W+|[a-zA-Z0-9]+', content, 0, re.UNICODE)
freq = collections.Counter()
for sentence in sentences:
    if sentence:
        l = len(sentence)
        wl = min(l, max_word_len)
        # Count every substring of length 1..max_word_len as a candidate.
        for i in range(1, wl + 1):
            for j in range(0, l - i + 1):
                freq[sentence[j:j + i]] += 1
total = sum([val for _, val in freq.iteritems()])
ps = collections.defaultdict(int)
for word, val in freq.iteritems():
    ps[word] = float(val) / total
words = set()
for word, word_p in ps.items():
    if len(word) > 1:
        # Cohesion test: p(word) must far exceed the best product of the
        # probabilities of its two halves, over every possible split point.
        p = 0
        for i in range(1, len(word)):
            t = ps[word[0:i]] * ps[word[i:]]
            p = max(p, t)
        if freq[word] >= 3 and word_p / p > 100:
            words.add(word)
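# For a 3-character candidate w, the splits tried above are w[0]|w[1:] and
# w[:2]|w[2]; w survives only if it occurred at least 3 times and its own
# probability is more than 100x the larger of the two products, i.e. its
# parts co-occur far more often than independence would predict.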
final_words = set()
for word in words:
    lf = rf = True
    left_words = collections.Counter()
    right_words = collections.Counter()
    # Match the word with at most one neighboring character on each side.
    pattern = re.compile(word.join(['.?', '.?']))
    for sentence in sentences:
        l = pattern.findall(sentence)
        if l:
            # If the match starts/ends with the word itself, the word sat at
            # a sentence boundary and that side has no observable neighbor.
            if l[0][0] != word[0]:
                left_words[l[0][0]] += 1
            else:
                lf = False
            if l[0][-1] != word[-1]:
                right_words[l[0][-1]] += 1
            else:
                rf = False
    # Freedom test: a real word should appear in varied contexts, so the
    # entropy of its neighboring characters should not be too low.
    left_info_entropy = info_entropy(left_words)
    right_info_entropy = info_entropy(right_words)
    if lf and len(left_words) > 0 and left_info_entropy < entropy_threshold:
        continue
    if rf and len(right_words) > 0 and right_info_entropy < entropy_threshold:
        continue
    final_words.add(word)
words_list = list(final_words)
words_list.sort(cmp=lambda x, y: cmp(freq[y], freq[x]))
for word in words_list:
    print word.encode('utf8'), freq[word]
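Each output line is a word followed by its raw count, which is exactly the word/freq pair that the comparison script at the top reads back with line.split().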
@alpha360x

dbl!!

@ZhanruiLiang

This program is written in a pretty brute-force way; it looks like it will be slow. Seeing range I first assumed it was Python 3, but the print statements show it is actually Python 2, so xrange would be better. Also, this way of computing total is clearly wasteful: if you want to use sum, you can pass it a generator instead of actually building the whole list. Granted, hardware is cheap these days, but still no need for that.

@lastland
Author

@ZhanruiLiang You're right, this code is quite brute-force. I was in a hurry to get it working and got sloppy.
I've only learned Python 2 and haven't touched Python 3 yet; using range instead of xrange is a bad habit I never broke, and xrange would indeed have been better, even in a hurry. total could in fact have been computed inside the earlier loop, but I had derived the expression only halfway when I realized I could just sum it, and laziness won out... Thank you very much for these suggestions; I'll go fix them shortly.
Another thing that makes this program slow is the double loop where the information entropy is computed, which should also be optimizable. I hope to properly improve the structure and algorithms of this program when I get the chance, and everyone is welcome to keep offering comments and suggestions. :D

@leekuhle

What does the ps[word[0:i]] * ps[word[i:]] part compute?

@tina437213

Indeed, when the file is large (say 20 MB), the loop in the information-entropy part is very slow. Also, why do I get the error TypeError: not all arguments converted during string formatting when writing the results to a file?

fileobj = open('gugong_word2.txt' % 'wb')
for word in words_list:
    print word.encode('utf-8', 'ignore'), freq[word]
    word_freq = '%s\t%d\n' % (word.encode('utf-8', 'ignore'), freq[word])
    fileobj.write(word_freq)
fileobj.close()

TypeError : not all arguments converted during string formatting
