Created
June 16, 2016 14:08
-
-
Save i5on9i/dcf7be5e60eeccdda2e1a3cf571659a1 to your computer and use it in GitHub Desktop.
Summary: A Naive Bayes classifier Home-page: http://www.cbs.dtu.dk Author: Kasper Jensen Author-email: kasjens@cbs.dtu.dk
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from math import exp, pi, sqrt
class NaiveBayes:
    """Naive Bayes classifier for nominal and real-valued attributes.

    Typical use: optionally declare real attributes (``set_real``) and
    pseudo-counts (``set_smoothing``), feed counted observations with
    ``add_instances``, call ``train`` and then ``predict``.

    Nominal attributes are modelled with relative frequencies per label,
    real attributes with one Gaussian per label.  ``predict`` reads
    ``self.model``, which only exists after ``train`` has been called.
    """

    def __init__(self):
        self.labels = []            # labels in order of first appearance
        self.stat_labels = {}       # label -> number of cases
        self.numof_instances = 0    # total number of cases added
        self.attributes = []        # attribute names in registration order
        self.attvals = {}           # attribute -> list of seen values
        self.real_stat = {}         # kept for compatibility; not filled here
        self.stat_attributes = {}   # attribute -> value -> label -> cases
        self.smoothing = {}         # attribute -> pseudo-count
        self.attribute_type = {}    # attribute -> 'real' or 'nominal'
        self.gaussian = 1 / sqrt(2 * pi)  # normalisation of the unit Gaussian

    def set_real(self, attributes):
        """Mark every attribute in ``attributes`` as real-valued."""
        for attribute in attributes:
            self.attribute_type[attribute] = 'real'

    def set_smoothing(self, attributes):
        """Store pseudo-counts given as a mapping attribute -> count."""
        for attribute in attributes.keys():
            self.smoothing[attribute] = attributes[attribute]

    def drop_attributes(self, attributes):
        """Forget the given attributes and all statistics kept for them.

        Raises ``KeyError`` when an attribute has no recorded statistics.
        """
        # Materialise once: the argument is walked twice and may be a
        # one-shot iterable such as a generator.
        dropped = list(attributes)
        for attribute in dropped:
            del self.attvals[attribute]
            del self.stat_attributes[attribute]
            del self.attribute_type[attribute]
            self.real_stat.pop(attribute, None)
            self.smoothing.pop(attribute, None)
        self.attributes = [a for a in self.attributes if a not in dropped]

    def add_instances(self, params):
        """Add ``params['cases']`` observations of one attribute/label combo.

        ``params`` must contain ``'attributes'`` (mapping attribute ->
        value), ``'label'`` and ``'cases'`` (a count).  The first call
        registers the attribute set; later calls must supply every known
        attribute.

        Raises ``Exception`` when a required key or attribute is missing.
        """
        if ('attributes' not in params or 'label' not in params
                or 'cases' not in params):
            raise Exception('Missing instance parameters')
        given = params['attributes']
        label = params['label']
        cases = params['cases']

        if not self.stat_attributes:
            # First instance defines which attributes are tracked.
            for attribute in given:
                self.stat_attributes[attribute] = {}
                self.attributes.append(attribute)
                self.attvals[attribute] = []
                if attribute not in self.attribute_type:
                    self.attribute_type[attribute] = 'nominal'
        else:
            for attribute in self.attribute_type:
                if attribute not in given:
                    raise Exception(
                        'Attribute not given in instance: ' + str(attribute))
        # Defensive check kept from the original, done before any counter
        # is touched so a rejected instance leaves the statistics unchanged.
        for attribute in self.stat_attributes:
            if attribute not in given:
                raise Exception('Attribute ' + str(attribute) + ' not given')

        self.numof_instances += cases
        if label not in self.stat_labels:
            self.labels.append(label)
            self.stat_labels[label] = 0
        self.stat_labels[label] += cases

        for attribute, counts in self.stat_attributes.items():
            attval = given[attribute]
            if attval not in counts:
                self.attvals[attribute].append(attval)
                counts[attval] = {}
            per_label = counts[attval]
            per_label[label] = per_label.get(label, 0) + cases

    def train(self):
        """Build ``self.model`` from the statistics collected so far.

        The model holds label priors (``'lprob'``), conditional
        probabilities of nominal values (``'cprob'``) and per-label
        sum/count/mean/sigma of real attributes (``'real_stat'``).

        Raises ``Exception`` when smoothing is set for a real attribute or
        when ``'*'`` occurs as a value of a smoothed nominal attribute.
        """
        self.model = {'lprob': {}, 'cprob': {}, 'real_stat': {}}
        for label, count in self.stat_labels.items():
            self.model['lprob'][label] = count / self.numof_instances
        for attribute in self.stat_attributes:
            if self.attribute_type[attribute] == 'real':
                self._train_real(attribute)
            else:
                self._train_nominal(attribute)

    def _train_nominal(self, attribute):
        """Fill ``model['cprob'][attribute]`` with P(value | label)."""
        counts = self.stat_attributes[attribute]
        cprob = self.model['cprob'][attribute] = {}
        smoothed = attribute in self.smoothing
        for label in self.stat_labels:
            seen = []
            total = 0
            for attval, per_label in counts.items():
                n = per_label.get(label, 0)
                if n > 0:
                    seen.append(attval)
                    cprob.setdefault(attval, {})[label] = n
                    total += n
            if smoothed:
                # '*' is the bucket for unseen values; reject a clash
                # before the real count would be overwritten.
                if '*' in seen:
                    raise Exception(
                        "'*' as attribute value has been reserved")
                pseudo = self.smoothing[attribute]
                if pseudo <= 0:
                    pseudo = 0.5
                cprob.setdefault('*', {})[label] = pseudo
                total += pseudo
                seen.append('*')
            for attval in seen:
                cprob[attval][label] /= total

    def _train_real(self, attribute):
        """Fill ``model['real_stat'][attribute]`` with per-label mean/sigma."""
        if attribute in self.smoothing:
            raise Exception(
                'Smoothing has been set for real attribute ' + str(attribute))
        counts = self.stat_attributes[attribute]
        stats = self.model['real_stat'][attribute] = {}
        for attval, per_label in counts.items():
            value = float(attval)
            for label, n in per_label.items():
                entry = stats.setdefault(
                    label, {'sum': 0, 'count': 0, 'mean': 0, 'sigma': 0})
                entry['sum'] += value * n
                entry['count'] += n
        for entry in stats.values():
            if entry['count'] > 0:
                entry['mean'] = entry['sum'] / entry['count']
        # 'sigma' first accumulates the weighted squared deviations ...
        for attval, per_label in counts.items():
            value = float(attval)
            for label, n in per_label.items():
                entry = stats[label]
                entry['sigma'] += (value - entry['mean']) ** 2 * n
        # ... and is then turned into the sample standard deviation.
        for entry in stats.values():
            if entry['count'] > 1:
                entry['sigma'] = sqrt(entry['sigma'] / (entry['count'] - 1))
            else:
                # Undefined with fewer than two cases; store 0.0 instead of
                # dividing by zero so the remaining attributes stay usable.
                entry['sigma'] = 0.0

    def predict(self, params):
        """Return a dict label -> normalised score for one instance.

        ``params['attributes']`` maps attribute -> value; only the
        attributes given are used.  ``train`` must have been called.

        Raises ``Exception`` for a missing ``'attributes'`` key, an unknown
        attribute, or an unseen value of an unsmoothed nominal attribute.
        Raises ``ZeroDivisionError`` when a real attribute has zero
        standard deviation for some label, or when every label ends up
        with score zero.
        """
        if 'attributes' not in params:
            raise Exception('Missing attributes parameter')
        scores = {label: self.model['lprob'][label] for label in self.labels}
        for attribute, raw in params['attributes'].items():
            if attribute not in self.attribute_type:
                raise Exception('Unknown attribute ' + str(attribute))
            if self.attribute_type[attribute] == 'real':
                self._apply_real(scores, attribute, raw)
            else:
                self._apply_nominal(scores, attribute, raw)
        total = sum(scores.values())
        if scores and total == 0:
            raise ZeroDivisionError(
                'All labels have zero probability for the given attributes; '
                'consider smoothing')
        for label in scores:
            scores[label] /= total
        return scores

    def _apply_nominal(self, scores, attribute, attval):
        """Multiply ``scores`` by P(attval | label) for a nominal attribute."""
        smoothed = attribute in self.smoothing
        if attval not in self.stat_attributes[attribute] and not smoothed:
            # str() so non-string values produce this error, not TypeError.
            raise Exception('Attribute value ' + str(attval) + ' not defined')
        cprob = self.model['cprob'][attribute]
        for label in self.labels:
            probability = cprob.get(attval, {}).get(label, 0)
            if probability > 0:
                scores[label] *= probability
            elif smoothed:
                scores[label] *= cprob['*'][label]
            else:
                scores[label] = 0

    def _apply_real(self, scores, attribute, raw):
        """Multiply ``scores`` by the Gaussian density of a real attribute."""
        value = float(raw)
        stats = self.model['real_stat'][attribute]
        densities = {}
        for label in self.labels:
            sigma = stats[label]['sigma']
            if sigma == 0:
                raise ZeroDivisionError(
                    'Zero standard deviation for real attribute '
                    + str(attribute) + ' and label ' + str(label))
            z = (value - stats[label]['mean']) / sigma
            # z * z rather than z ** 2: overflows to inf (density 0.0)
            # instead of raising OverflowError for extreme values.
            densities[label] = self.gaussian / sigma * exp(-0.5 * (z * z))
        # Skip the attribute when every density underflowed to zero.  The
        # sum is per attribute, so an earlier real attribute cannot mask it.
        if sum(densities.values()) != 0:
            for label in self.labels:
                scores[label] *= densities[label]

    def preferredLabel(self, scores):
        """Return the label with the highest score, or None if empty.

        On ties the label that comes first in ``scores`` wins.
        """
        if not scores:
            return None
        return max(scores, key=scores.get)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment