Skip to content

Instantly share code, notes, and snippets.

@staticor
Created July 27, 2013 03:25
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save staticor/6093569 to your computer and use it in GitHub Desktop.
Save staticor/6093569 to your computer and use it in GitHub Desktop.
# -*- coding=utf-8 -*-
import collections
 
# Usage:
# Save the results printed by WordsDetector.py to files,
# put those file names in the `names` list below, then run this program.
 
# Input files to process; each entry is a path that is opened later on.
names = [
    'name0',
    'name1',
    'name2',
    'name3',
]

# One independent word -> frequency Counter per input file, keyed by file name.
words = {name: collections.Counter() for name in names}
# Word frequencies accumulated over all input files.
total_words = collections.Counter()
 
# Read every input file into its own Counter, then fold that Counter into the
# grand total. Each non-blank line must be "<word> <freq>" separated by
# whitespace; any other shape raises ValueError, exactly as before.
for name in names:
    # `with` guarantees the file is closed, even if a malformed line raises.
    with open(name) as f:
        for line in f:
            fields = line.split()
            if not fields:
                # Tolerate blank lines (e.g. a trailing empty line at EOF).
                continue
            word, freq = fields
            words[name][word] += int(freq)
    # NOTE(review): the original indentation was lost; the total is assumed to
    # be updated once per file, since doing it per line would re-add the
    # partially built counter over and over.
    total_words += words[name]
 
# ps[name][word] is a smoothed estimate of the share of `word`'s overall
# occurrences that come from file `name`:
#
#     (freq + avg * total) / (total_words[word] + total)
#
# where `total` is the mean overall count of the file's words and `avg` is the
# mean raw share freq / total_words[word]. The extra terms act as additive
# smoothing: a word with a small overall count is pulled towards `avg`.
ps = {name: collections.defaultdict(int) for name in names}

for name in names:
    # Single-argument print(...) produces the same output on Python 2 and 3.
    print(name)

    counts = words[name]
    if not counts:
        # No words were read for this file; the averages below would divide
        # by zero, and there is nothing to rank anyway.
        continue

    cnt = float(len(counts))
    total = sum(total_words[word] for word in counts) / cnt
    avg = sum(float(freq) / total_words[word]
              for word, freq in counts.items()) / cnt
    avg_times_total = total * avg

    scores = ps[name]
    for word, freq in counts.items():
        scores[word] = (float(freq) + avg_times_total) / (total_words[word] + total)

    # Ten highest-scoring words, best first. key=/reverse= replaces the
    # Python-2-only cmp= argument and yields the same descending order.
    top_words = sorted(counts, key=scores.__getitem__, reverse=True)[:10]
    for word in top_words:
        # '%s' formatting reproduces the "*  word score" layout of the old
        # print statement without relying on Python 2 print syntax.
        print('*  %s %s' % (word, scores[word]))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment