Skip to content

Instantly share code, notes, and snippets.

@ratsgo
Created July 6, 2017 06:40
Show Gist options
  • Save ratsgo/45d6eb4822ae27b01329e3b8c15c8f98 to your computer and use it in GitHub Desktop.
Save ratsgo/45d6eb4822ae27b01329e3b8c15c8f98 to your computer and use it in GitHub Desktop.
from collections import defaultdict
from pandas import read_table
import numpy as np
import math
class NaiveBayesClassifier:
    """Two-class Naive Bayes classifier over whitespace-tokenized reviews.

    Documents whose rating is greater than 3.5 belong to class 0 (positive);
    all other documents belong to class 1 (negative). Each vocabulary word is
    modelled as present/absent in a document, and ``k`` is the additive
    smoothing constant applied to the per-class document counts.
    """

    def __init__(self, k=0.5):
        """Create an untrained classifier with smoothing constant ``k``."""
        self.k = k
        # List of (word, p(word | class 0), p(word | class 1)) tuples,
        # filled in by train().
        self.word_probs = []

    def load_corpus(self, path):
        """Read a comma-separated UTF-8 file and return its rows as an array.

        The rest of the class unpacks every row as ``(doc, point)``, so the
        file is expected to have exactly two columns.
        """
        corpus = read_table(path, sep=',', encoding='utf-8')
        return np.array(corpus)

    def count_words(self, training_set):
        """Count, per word, how many documents of each class contain it.

        Args:
            training_set: iterable of ``(doc, point)`` pairs, where ``doc`` is
                the review text and ``point`` is its rating.

        Returns:
            A defaultdict mapping word -> ``[class0_docs, class1_docs]``.
        """
        counts = defaultdict(lambda: [0, 0])
        for doc, point in training_set:
            # Only textual reviews are counted: skip missing values (e.g. NaN
            # or None) and reviews that consist of nothing but a number.
            if not isinstance(doc, str) or self.isNumber(doc):
                continue
            label = 0 if point > 3.5 else 1
            # Tokenize on whitespace and deduplicate: the probabilities are
            # "fraction of documents containing the word", so a word repeated
            # inside one review must be counted once. Counting every
            # occurrence could push the estimate above 1 and break log(1 - p).
            for word in set(doc.split()):
                counts[word][label] += 1
        return counts

    def isNumber(self, s):
        """Return True if ``s`` can be converted to ``float``, else False."""
        try:
            float(s)
        except (TypeError, ValueError):
            # TypeError covers non-string, non-numeric values such as None.
            return False
        return True

    def word_probabilities(self, counts, total_class0, total_class1, k):
        """Turn document counts into smoothed per-class word probabilities.

        Returns:
            A list of ``(word, p(word | class 0), p(word | class 1))`` tuples,
            each probability computed as ``(count + k) / (total + 2 * k)``.
        """
        return [(w,
                 (class0 + k) / (total_class0 + 2 * k),
                 (class1 + k) / (total_class1 + 2 * k))
                for w, (class0, class1) in counts.items()]

    def class0_probability(self, word_probs, doc):
        """Return the probability that ``doc`` belongs to class 0.

        Args:
            word_probs: list of ``(word, p(word | class 0), p(word | class 1))``.
            doc: review text; tokenized on whitespace only.

        Returns:
            ``p0 / (p0 + p1)`` where ``p0`` and ``p1`` are the likelihoods of
            the document under class 0 and class 1 respectively.
        """
        # A set gives O(1) membership tests for every vocabulary word.
        docwords = set(doc.split())
        log_prob_if_class0 = log_prob_if_class1 = 0.0
        for word, prob_if_class0, prob_if_class1 in word_probs:
            if word in docwords:
                # Word present: add the log probability of seeing it.
                log_prob_if_class0 += math.log(prob_if_class0)
                log_prob_if_class1 += math.log(prob_if_class1)
            else:
                # Word absent: add the log probability of not seeing it.
                log_prob_if_class0 += math.log(1.0 - prob_if_class0)
                log_prob_if_class1 += math.log(1.0 - prob_if_class1)
        # p0 / (p0 + p1) == 1 / (1 + exp(log_p1 - log_p0)). Working with the
        # difference avoids exponentiating each log-likelihood on its own,
        # which underflows to 0.0 for large vocabularies and ends in 0 / 0.
        diff = log_prob_if_class1 - log_prob_if_class0
        if diff >= 0:
            # exp(diff) could overflow here, so use the equivalent form in
            # exp(-diff), which can only underflow harmlessly to 0.0.
            ratio = math.exp(-diff)
            return ratio / (1.0 + ratio)
        return 1.0 / (1.0 + math.exp(diff))

    def train(self, trainfile_path):
        """Load the corpus at ``trainfile_path`` and fit ``self.word_probs``."""
        training_set = self.load_corpus(trainfile_path)
        # Number of documents in class 0 (positive) and class 1 (negative).
        num_class0 = sum(1 for _, point in training_set if point > 3.5)
        num_class1 = len(training_set) - num_class0
        word_counts = self.count_words(training_set)
        self.word_probs = self.word_probabilities(word_counts,
                                                  num_class0,
                                                  num_class1,
                                                  self.k)

    def classify(self, doc):
        """Return the class-0 (positive) probability of ``doc``."""
        return self.class0_probability(self.word_probs, doc)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment