Created
February 26, 2014 07:49
-
-
Save YoshihitoAso/9225377 to your computer and use it in GitHub Desktop.
[MeCab][Python]mecabを利用して日本語文章をnaive bayes分類するサンプル
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# coding: utf-8 | |
import MeCab | |
import math | |
class Bayesian(object):
    """Naive Bayes text classifier over MeCab word-count vectors.

    State:
      term_count: {term: {category: occurrence count}}
      cat_count:  {category: number of term observations seen for it}

    Scores are log-probabilities; unseen terms get a small floor
    probability instead of -inf.
    """

    def __init__(self):
        # Per-instance tables. (The original declared these as shared
        # class-level dicts, so counts leaked between instances.)
        self.term_count = {}
        self.cat_count = {}

    def MecabSpliter(self, text):
        """Tokenize *text* with MeCab and return {lowercased surface: count}.

        Keeps only content words by posid range (assumes the IPA
        dictionary's posid numbering -- verify if another dictionary
        is configured).
        """
        tagger = MeCab.Tagger("-Ochasen")
        node = tagger.parseToNode(text)
        words = {}
        # NOTE(review): `while node.next is not None` skips the final
        # node; kept as-is since the trailing node is the EOS marker.
        while node.next is not None:
            if (node.posid >= 36
                    or 10 <= node.posid <= 12
                    or 31 <= node.posid <= 33):
                key = node.surface.lower()
                words[key] = words.get(key, 0) + 1
            node = node.next
        return words

    def text2vec(self, text, spliter=None):
        """Convert *text* into a word-count vector.

        *spliter* is unused; retained for interface compatibility.
        """
        return self.MecabSpliter(text)

    def train(self, wv, cat):
        """Accumulate the word vector *wv* ({term: count}) under *cat*.

        Bug fixes vs. the original: the original compared `term` where
        `cat` was intended in both lookups, so per-category term counts
        were overwritten instead of summed, and cat_count was reset to 1
        on almost every term.
        """
        for term, cnt in wv.items():
            per_cat = self.term_count.setdefault(term, {})
            per_cat[cat] = per_cat.get(cat, 0) + cnt
            if cat in self.cat_count:
                self.cat_count[cat] += 1
            else:
                self.cat_count[cat] = 1

    def catProb(self, cat):
        """Return P(cat): fraction of all observations belonging to *cat*."""
        total = sum(self.cat_count.values())
        return 1.0 * self.cat_count[cat] / total

    def termCnt(self, term, cat):
        """Return the stored count of *term* under *cat* (0 if unseen)."""
        return self.term_count.get(term, {}).get(cat, 0)

    def termProb(self, term, cat):
        """Return P(term|cat) as a float.

        The leading 1.0 forces float division (under Python 2 the
        original truncated to 0 for any count below cat_count).
        """
        return 1.0 * self.termCnt(term, cat) / self.cat_count[cat]

    def predict(self, vec):
        """Return {category: log-score} for word vector *vec*."""
        return dict((cat, self.score(vec, cat)) for cat in self.cat_count)

    def score(self, vec, cat):
        """Return log P(cat) + sum of count-weighted log P(term|cat).

        Terms with zero probability fall back to a small floor
        (`not_likely`) so the log never sees 0.
        """
        cat_prob = math.log(self.catProb(cat))
        not_likely = 1.0 / (self.total_term_count() * 10)
        doc_prob = 0.0
        for term, count in vec.items():
            doc_prob += math.log(self.termProb(term, cat) or not_likely) * count
        return cat_prob + doc_prob

    def total_term_count(self):
        """Return the total number of observations across all categories."""
        return sum(self.cat_count.values())
if __name__ == '__main__':
    # Demo: train a ramen-vs-soba classifier on a few labelled
    # sentences, then classify unseen ones.
    # Renamed `filter` -> `classifier` and `input` -> `sample`
    # (both shadowed Python builtins); fixed the `porb` typo.
    classifier = Bayesian()

    training_data = [
        {"text":"宅麺を食べた。", "judge":"ramen"},
        {"text":"カップラーメンが美味しかった。", "judge":"ramen"},
        {"text":"味噌ラーメン美味しい。", "judge":"ramen"},
        {"text":"家で袋ラーメンを作った。", "judge":"ramen"},
        {"text":"宅麺について考えている。", "judge":"ramen"},
        {"text":"富士そばは美味しい。", "judge":"soba"},
        {"text":"富士そばでカレー蕎麦食べた。", "judge":"soba"},
        {"text":"今日も富士そばである。", "judge":"soba"},
        {"text":"富士そばは演歌がかかっている。", "judge":"soba"},
        {"text":"あなたのおそばに、富士そば", "judge":"soba"},
        {"text":"富士そばはダイタングループだ。", "judge":"soba"},
        {"text":"名代富士そば。", "judge":"soba"},
        {"text":"浅草で蕎麦を食べた。", "judge":"soba"},
        {"text":"名代 富士そば 大宮東口店6月5日(火)新規開店!", "judge":"soba"},
    ]
    for sample in training_data:
        words = classifier.text2vec(sample['text'])
        classifier.train(words, sample['judge'])

    # print(single_string) behaves identically on Python 2 and 3.
    print("============")

    srcs = [
        '味噌ラーメンを食べたいなぁ。',
        'カップラーメンの工場見学に行った。',
        '富士そばについて考えている。',
        '蕎麦食いたい。',
        'ラーメン食いたい。',
        '関西にも富士そばを出店すべき。',
        '富士そばのfacebookページを見ている。',
        '海外生活が続くと富士そばが食べたくなる。',
        '富士そばの出汁について考えている。',
    ]
    for src in srcs:
        words = classifier.text2vec(src)
        probs = classifier.predict(words)
        # Highest-scoring category wins (replaces the is_first flag
        # over a reverse-sorted list).
        best_tag, best_score = max(probs.items(), key=lambda kv: kv[1])
        print(str(best_tag) + ", " + str(best_score) + ", " + src)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment