Last active
June 25, 2023 16:03
-
-
Save tuttelikz/94f750ef3bf14f8a126a to your computer and use it in GitHub Desktop.
Naive Bayes From Scratch in Python
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#How To Implement Naive Bayes From Scratch in Python | |
#http://machinelearningmastery.com/naive-bayes-classifier-scratch-python/ | |
#Dataset | |
#https://archive.ics.uci.edu/ml/machine-learning-databases/pima-indians-diabetes/pima-indians-diabetes.data | |
import csv | |
import math | |
import random | |
#Handle data | |
def loadCsv(filename): | |
lines = csv.reader(open(filename, "r")) | |
dataset = list(lines) | |
for i in range(len(dataset)): | |
dataset[i] = [float(x) for x in dataset[i]] | |
return dataset | |
#Test handling data | |
""" | |
filename = 'pima-indians-diabetes.data.csv' | |
SomeDataset = loadCsv(filename) | |
print("Loaded data file {0:s} with {1:5d} rows".format(filename,len(SomeDataset))) | |
""" | |
#Split dataset with ratio | |
def splitDataset(dataset, splitRatio): | |
trainSize = int(len(dataset) * splitRatio) | |
trainSet = [] | |
copy = list(dataset) | |
while len(trainSet) < trainSize: | |
index = random.randrange(len(copy)) | |
trainSet.append(copy.pop(index)) | |
return [trainSet, copy] | |
#Test splitting data | |
""" | |
dataset = [[1], [2], [3], [4], [5]] | |
splitRatio = 0.67 | |
train, test = splitDataset(dataset, splitRatio) | |
print('Split {0} rows into train with {1} and test with {2}'.format(len(dataset),train,test)) | |
""" | |
#Separate by Class | |
def separateByClass(dataset): | |
separated = {} | |
for i in range(len(dataset)): | |
vector = dataset[i] | |
if (vector[-1] not in separated): | |
separated[vector[-1]] = [] | |
separated[vector[-1]].append(vector) | |
return separated | |
#Test separating by class | |
""" | |
dataset = [[1,20,1],[2,21,0],[3,22,1]] | |
separated = separateByClass(dataset) | |
print('Separated instances: {0}'.format(separated)) | |
""" | |
#Calculate Mean | |
def mean(numbers): | |
return sum(numbers)/float(len(numbers)) | |
def stdev(numbers): | |
avg = mean(numbers) | |
variance = sum([pow(x-avg,2) for x in numbers])/float(len(numbers)-1) | |
return math.sqrt(variance) | |
#Test stdev & mean calculation | |
""" | |
numbers = [1,2,3,4,5] | |
print('Summary of {0}: mean={1}, stdev={2}'.format(numbers, mean(numbers), stdev(numbers))) | |
""" | |
#Summarize Dataset | |
def summarize(dataset): | |
summaries = [(mean(attribute), stdev(attribute)) for attribute in zip(*dataset)] | |
del summaries[-1] | |
return summaries | |
#Test summarizing data | |
""" | |
dataset = [[1,20,0], [2,21,1], [3,22,0]] | |
summary = summarize(dataset) | |
print('Attribute summaries: {0}'.format(summary)) | |
""" | |
#Summarize attributes by class | |
def summarizeByClass(dataset): | |
separated = separateByClass(dataset) | |
summaries = {} | |
for classValue, instances in separated.items(): | |
summaries[classValue] = summarize(instances) | |
return summaries | |
#Test summarizing attributes | |
""" | |
dataset = [[1,20,1], [2,21,0], [3,22,1], [4,22,0]] | |
summary = summarizeByClass(dataset) | |
print('Summary by class value: {0}'.format(summary)) | |
""" | |
#Calculate Gaussian Probability Density Function | |
def calculateProbability(x, mean, stdev): | |
exponent = math.exp(-(math.pow(x-mean,2)/(2*math.pow(stdev,2)))) | |
return (1/(math.sqrt(2*math.pi)*stdev))*exponent | |
#Testing Gaussing PDF | |
""" | |
x = 71.5 | |
mean = 73 | |
stdev = 6.2 | |
probability = calculateProbability(x,mean,stdev) | |
print('Probability of belonging to this class: {0}'.format(probability)) | |
""" | |
#Calculate Class Probabilities | |
def calculateClassProbabilities(summaries, inputVector): | |
probabilities = {} | |
for classValue, classSummaries in summaries.items(): | |
probabilities[classValue] = 1 | |
for i in range(len(classSummaries)): | |
mean, stdev = classSummaries[i] | |
x = inputVector[i] | |
probabilities[classValue] *= calculateProbability(x, mean, stdev) | |
return probabilities | |
#Testing Class Probability calculation | |
""" | |
summaries = {0:[(1, 0.5)], 1:[(20, 5.0)]} | |
inputVector = [1.1, '?'] | |
probabilities = calculateClassProbabilities(summaries, inputVector) | |
print('Probabilities for each class: {0}'.format(probabilities)) | |
""" | |
#Make a prediction | |
def predict(summaries, inputVector): | |
probabilities = calculateClassProbabilities(summaries, inputVector) | |
bestLabel, bestProb = None, -1 | |
for classValue, probability in probabilities.items(): | |
if bestLabel is None or probability > bestProb: | |
bestProb = probability | |
bestLabel = classValue | |
return bestLabel | |
#Test prediction | |
""" | |
summaries = {'A':[(1, 0.5)], 'B':[(20, 5.0)]} | |
inputVector = [1.1, '?'] | |
result = predict(summaries, inputVector) | |
print('Prediction: {0}'.format(result)) | |
""" | |
#Get predictions | |
def getPredictions(summaries, testSet): | |
predictions = [] | |
for i in range(len(testSet)): | |
result = predict(summaries, testSet[i]) | |
predictions.append(result) | |
return predictions | |
#Test predictions | |
""" | |
summaries = {'A':[(1, 0.5)], 'B':[(20, 5.0)]} | |
testSet = [[1.1,'?'], [19.1,'?']] | |
predictions = getPredictions(summaries, testSet) | |
print('Predictions: {0}',format(predictions)) | |
""" | |
#Get Accuracy | |
def getAccuracy(testSet, predictions): | |
correct = 0 | |
for x in range(len(testSet)): | |
if testSet[x][-1] == predictions[x]: | |
correct += 1 | |
return (correct/float(len(testSet)))*100.0 | |
#Test accuracy | |
""" | |
testSet = [[1,1,1,'a'], [2,2,2,'a'], [3,3,3,'b']] | |
predictions = ['a', 'a', 'a'] | |
accuracy = getAccuracy(testSet, predictions) | |
print('Accuracy: {0}'.format(accuracy)) | |
""" | |
def main(): | |
filename = 'pima-indians-diabetes.data.csv' | |
splitRatio = 0.67 | |
dataset = loadCsv(filename) | |
trainingSet, testSet = splitDataset(dataset, splitRatio) | |
print('Split {0} rows into train = {1} and test = {2} rows'.format(len(dataset),len(trainingSet),len(testSet))) | |
#prepare model | |
summaries = summarizeByClass(trainingSet) | |
#test model | |
predictions = getPredictions(summaries, testSet) | |
accuracy = getAccuracy(testSet, predictions) | |
print('Accuracy: {0}%'.format(accuracy)) | |
main() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
In the function :calculateClassProbabilities, we are getting probabilities of only 1 class. Is that correct?