# - Naive Bayes implementation based on the book Machine Learning by Tom M. Mitchell
# - Application to the task of sentiment analysis with text classification using the movie_reviews corpus from nltk.corpus
# Copyright (C) 2015 Eric Aislan Antonelo
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
from nltk.corpus import movie_reviews
import nltk
import random
import re
from collections import Counter
import numpy as np

# each document is a (list of words, category) pair, with category in {'pos', 'neg'}
documents = [(list(movie_reviews.words(fileid)), category)
             for category in movie_reviews.categories()
             for fileid in movie_reviews.fileids(category)]
random.shuffle(documents)
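# the movie_reviews corpus ships 2000 reviews, 1000 per class, so the class
# priors P(v) estimated below come out close to 0.5 each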
# 0.92 0.88
##
print 'starting..'
# compute the most frequent words over the whole corpus
all_words = nltk.FreqDist(w.lower() for w in movie_reviews.words())
def filter_words(words):
    # keep only words of length >= 3 that do not start with a non-word character
    words = filter(lambda x: (len(x) >= 3 and re.search(r'^\W', x) is None), words)
    return [word.lower() for word in words]

# choose as vocabulary the most frequent words in the texts
# (most_common() guarantees frequency order; plain list(all_words) does not)
# vocabulary = filter_words([w for w, c in all_words.most_common()][:2000])
vocabulary = filter_words([w for w, c in all_words.most_common()][100:10000])
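# slicing from index 100 skips the hundred most frequent words, which acts as
# a rough stop-word filter (very common words carry little sentiment signal)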
documents = [(filter_words(doc), t) for (doc, t) in documents]
# hold out the first 100 shuffled documents for testing; train on the rest
# (training on all documents would leak the test set and inflate accuracy)
train_set, test_set = documents[100:], documents[:100]
targets = ['pos', 'neg']
n_v = len(vocabulary)
##
print '...'
def train_naive_bayes(train_set):
    # LEARN_NAIVE_BAYES_TEXT (Mitchell, ch. 6): estimate class priors P(v) and
    # Laplace-smoothed word likelihoods P(w_k | v) for each class
    P_v = {}
    P_word = {}
    docs = {}
    Text = {}
    for target in targets:
        docs[target] = filter(lambda example: example[1] == target, train_set)
        P_v[target] = len(docs[target]) / float(len(train_set))
        Text[target] = ' '.join([' '.join(doc) for (doc, t) in docs[target]])
        Text[target] = Text[target].lower()
        words = Text[target].split()
        n = len(words)  # total number of word positions in Text_j (Mitchell's n)
        # count wk as a whole token; a plain substring count such as
        # Text[target].count(wk) would also match wk inside longer words
        word_counts = Counter(words)
        P_word[target] = np.zeros(n_v)
        for k in range(n_v):
            if np.mod(k, 1000) == 0:
                print target, k
            wk = vocabulary[k]
            nk = word_counts[wk]
            # Laplace smoothing: P(w_k | v) = (n_k + 1) / (n + |Vocabulary|)
            P_word[target][k] = (nk + 1) / float(n + n_v)
    return (P_v, P_word)
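# a worked instance of the smoothing formula (illustrative numbers, not taken
# from this corpus): with nk = 4, n = 20000 word positions and n_v = 5000
# vocabulary entries, P(w_k | v) = (4 + 1) / (20000 + 5000) = 2e-4, while a
# word that never occurs in the class text still gets 1/25000 instead of zero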
print 'Starting training...'
P_v, P_word = train_naive_bayes(train_set)
print 'Finished training...'
##
print 'Starting test...'
def classify_naive_bayes(doc, P_v, P_word):
    # CLASSIFY_NAIVE_BAYES_TEXT (Mitchell): collect the vocabulary positions of
    # the words appearing in doc, then return the class with the highest score
    doc_words = set(doc)
    word_positions = []
    for k in range(len(vocabulary)):
        if vocabulary[k] in doc_words:
            word_positions.append(k)
    # print word_positions  # debug output
    likelihood = {}
    for target in targets:
        # score in log space: the sum of logs replaces the product of
        # probabilities, which would underflow for long documents
        likelihood[target] = np.log(P_v[target]) + np.sum([np.log(P_word[target][k]) for k in word_positions])
    # print likelihood  # debug output
    return max(likelihood, key=likelihood.get)
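# why log space matters, with concrete numbers: a review can hit hundreds of
# vocabulary words with likelihoods around 1e-4, and 1e-4 ** 200 == 0.0 in
# float64, whereas the equivalent log score 200 * np.log(1e-4) ~= -1842.1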
n = len(test_set)
n_correct = 0
desired = []
output = []
for j in range(n):
    t = classify_naive_bayes(test_set[j][0], P_v, P_word)
    desired.append(test_set[j][1])
    output.append(t)
    if t == test_set[j][1]:
        print t, test_set[j][1]
        n_correct += 1
    else:
        print 'err', t, test_set[j][1]
print n_correct, n_correct/float(n)
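##
# quick sanity check on a hand-written review (illustrative only: the sentence
# and its whitespace tokenization are an assumption, not part of the corpus)
sample = filter_words('a wonderful movie with brilliant acting and a great story'.split())
print 'sample ->', classify_naive_bayes(sample, P_v, P_word)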
##
from matplotlib import pyplot as plt
from matplotlib.pyplot import plot  # , subplot, vlines
# compare the per-word likelihoods learned for each class
plot(P_word['neg'], '.')
plot(P_word['pos'], 'r*')
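# optional axis labels (an addition for readability; the naming is assumed)
plt.xlabel('vocabulary index k')
plt.ylabel('P(w_k | class)')
plt.legend(['neg', 'pos'])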
plt.show()
## | |
P_v = {} | |
P_word = {} | |
docs = {} | |
Text = {} | |
n_v = len(vocabulary) | |
nks = {'pos': [], 'neg': []} | |
for target in targets: | |
docs[target] = filter(lambda (example): example[1] == target, train_set) | |
P_v[target] = len(docs[target]) / float(len(train_set)) | |
Text[target] = ' '.join([' '.join(doc) for (doc, t) in docs[target]]) | |
Text[target] = Text[target].lower() | |
n = len(set(Text[target].split() )) | |
P_word[target] = np.zeros(n_v) | |
for k in range(n_v): | |
if np.mod(k, 1000) == 0: | |
print target, k | |
wk = vocabulary[k] | |
nk = Text[target].count(wk) | |
nks[target].append(nk) | |
# P_word[target][k] = (nk)/float((n)) | |
P_word[target][k] = (nk + 1)/float((n+n_v )) | |
##
plot(nks['neg'], '.')
plot(nks['pos'], 'r*')
plt.show()