Created August 11, 2012 07:14
A Python program that attempts to implement the automatic Chinese word-extraction algorithm described in this post: http://www.matrix67.com/blog/archives/5044
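In outline, the post scores every character n-gram on two signals: cohesion (how much more frequent the n-gram is than its most probable two-part split would predict) and boundary entropy (how varied the characters to its left and right are). A minimal sketch of the cohesion signal, using made-up toy probabilities rather than real corpus statistics:

# Toy illustration of the cohesion test (probabilities are invented).
ps = {u'电影院': 0.0005, u'电': 0.02, u'影院': 0.0006,
      u'电影': 0.002, u'院': 0.005}
word = u'电影院'
# Most probable two-part split of the candidate word.
best_split = max(ps[word[:i]] * ps[word[i:]] for i in range(1, len(word)))
print ps[word] / best_split  # ~41.7: far more frequent than its best split predicts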
# -*- coding: utf-8 -*-
import feedparser
import re
import collections
import math


def info_entropy(words):
    # Shannon entropy (natural log) of a Counter of neighbor characters.
    result = 0
    total = sum([val for _, val in words.iteritems()])
    for word, cnt in words.iteritems():
        p = float(cnt) / total
        result -= p * math.log(p)
    return result


max_word_len = 5       # longest candidate word to consider
entropy_threshold = 1  # minimum boundary entropy for a word to survive

# Gather text (titles and descriptions) from an RSS feed.
content = []
articles = feedparser.parse('http://www.liyaos.com/blog/feed')
for article in articles.entries:
    content.append(article.title)
    content.extend(re.split('<.*?>| ', article.description, 0, re.UNICODE))
    # replace the line above with this if using ATOM:
    # try:
    #     s = article.content[0]['value']
    # except AttributeError:
    #     try:
    #         s = article.summary
    #     except AttributeError:
    #         s = ''
    # content.extend(re.split('<.*?>| ', s, 0, re.UNICODE))
content = u''.join(content)

# Split on punctuation/whitespace and ASCII alphanumerics, leaving runs of
# CJK characters as the "sentences" to mine.
sentences = re.split("\W+|[a-zA-Z0-9]+", content, 0, re.UNICODE)

# Count every substring of length 1..max_word_len as a candidate word.
freq = collections.Counter()
for sentence in sentences:
    if sentence:
        l = len(sentence)
        wl = min(l, max_word_len)
        for i in range(1, wl + 1):
            for j in range(0, l - i + 1):
                freq[sentence[j:j + i]] += 1

# Empirical probability of each candidate.
total = sum([val for _, val in freq.iteritems()])
ps = collections.defaultdict(int)
for word, val in freq.iteritems():
    ps[word] = float(val) / total

# Cohesion filter: keep a candidate only if it is much more probable than
# its most probable two-part split would predict.
words = set()
for word, word_p in ps.items():
    if len(word) > 1:
        p = 0
        for i in range(1, len(word)):
            t = ps[word[0:i]] * ps[word[i:]]
            p = max(p, t)
        if freq[word] >= 3 and word_p / p > 100:
            words.add(word)

# Boundary-entropy filter: a real word should occur in varied left/right
# contexts, i.e. its neighbor distributions should have high entropy.
final_words = set()
for word in words:
    lf = rf = True
    left_words = collections.Counter()
    right_words = collections.Counter()
    pattern = re.compile(word.join(['.?', '.?']))
    for sentence in sentences:
        l = pattern.findall(sentence)
        if l:
            if l[0][0] != word[0]:
                left_words[l[0][0]] += 1
            else:
                lf = False  # word seen at the start of a sentence
            if l[0][-1] != word[-1]:
                right_words[l[0][-1]] += 1
            else:
                rf = False  # word seen at the end of a sentence
    left_info_entropy = info_entropy(left_words)
    right_info_entropy = info_entropy(right_words)
    if lf and len(left_words) > 0 and left_info_entropy < entropy_threshold:
        continue
    if rf and len(right_words) > 0 and right_info_entropy < entropy_threshold:
        continue
    final_words.add(word)

# Print the extracted words, most frequent first.
words_list = list(final_words)
words_list.sort(key=lambda w: freq[w], reverse=True)
for word in words_list:
    print word.encode('utf8'), freq[word]
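For intuition about the boundary-entropy filter, a quick worked example of the entropy computation on toy neighbor counts (illustrative values only, not from the feed):

import collections
import math

def info_entropy(words):
    # The same computation as info_entropy above, restated self-contained.
    result = 0
    total = sum(words.itervalues())
    for _, cnt in words.iteritems():
        p = float(cnt) / total
        result -= p * math.log(p)
    return result

# Four distinct left neighbors, evenly spread: entropy log(4) = 1.386,
# which clears entropy_threshold = 1, so the candidate survives.
print info_entropy(collections.Counter({u'看': 2, u'去': 2, u'在': 2, u'回': 2}))

# Always the same neighbor: entropy 0.0, so the candidate is filtered out.
print info_entropy(collections.Counter({u'的': 8}))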
Indeed. When the file is large (say 20 MB), the loop in the information-entropy part is very slow. Also, why do I get this error when writing to a file: TypeError: not all arguments converted during string formatting
fileobj = open('gugong_word2.txt' % 'wb')
for word in words_list:
    print word.encode('utf-8','ignore'), freq[word]
    word_freq = '%s\t%d\n' % (word.encode('utf-8','ignore'), freq[word])
    fileobj.write(word_freq)
fileobj.close()

TypeError: not all arguments converted during string formatting
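The TypeError comes from open('gugong_word2.txt' % 'wb'): % is the string-formatting operator, and the format string contains no %s placeholder to consume 'wb', hence "not all arguments converted during string formatting". The file mode belongs in a second argument to open(). A corrected sketch:

fileobj = open('gugong_word2.txt', 'w')  # mode is a separate argument, not via %
for word in words_list:
    print word.encode('utf-8', 'ignore'), freq[word]
    fileobj.write('%s\t%d\n' % (word.encode('utf-8', 'ignore'), freq[word]))
fileobj.close()

As for the slowness on large inputs: the entropy pass rescans every sentence once per candidate word, roughly O(candidates x sentences); building an index from each candidate to its neighbor characters during the counting pass would avoid the repeated scans.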
What does the ps[word[0:i]] * ps[word[i:]] part compute?
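That product is the cohesion (凝固程度) test from the Matrix67 post: ps[word[0:i]] and ps[word[i:]] are the empirical probabilities of the candidate's prefix and suffix when cut at position i, so their product estimates how probable the string would be if its two parts occurred independently. The loop keeps the largest product over all cut points, and word_p / p > 100 then demands that the whole candidate be at least 100x more frequent than its most plausible decomposition predicts; with the toy numbers sketched after the gist description above, u'电影院' scores about 41.7 against its best split u'电' + u'影院'.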