Yuri-M-Dias/DataMiningAlgorithms.py

## DataMiningAlgorithms.py
#!/usr/bin/env python
import math
import operator
from sklearn import datasets
from sklearn import svm
import numpy as np
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap

class DataMiningStrategy(object):
    """Common duck-typed interface for the data-mining algorithms.

    The base class carries no algorithm of its own; calling its
    ``learnFromInput`` always fails so that subclasses must override it.
    """

    def learnFromInput(self, elements, compareTo):
        """Abstract hook: raise NotImplementedError unconditionally."""
        message = "Implement the learn from input!"
        raise NotImplementedError(message)

class NNearestImpl(DataMiningStrategy):
    """Nearest-neighbours classifier over two numeric attributes.

    The ``NEIGHBOURS`` closest training rows vote on the class label.
    """

    # How many of the closest training rows take part in the vote.
    NEIGHBOURS = 5

    def learnFromInput(self, elements, compareTo):
        """Rank the class labels found among the nearest training rows.

        Parameters:
            elements: training columns. ``elements[0]`` and ``elements[1]``
                hold the two numeric attributes and ``elements[-1]`` the
                class label; each column exposes ``getElements()`` and the
                label column also ``getNumberElements()``.
            compareTo: the query point written as ``"x|y"``.

        Returns:
            The distinct class labels (as strings) seen among the nearest
            rows, most votes first. Labels with equal votes keep
            nearest-first order, so the first entry is the prediction.
            The list is empty when there are no training rows.

        Raises:
            IndexError: ``compareTo`` has fewer than two ``|`` fields.
            ValueError: a coordinate cannot be parsed as a float.
        """
        coordinates = compareTo.split("|")
        a1 = float(coordinates[0])
        a2 = float(coordinates[1])
        trainingClass = elements[-1]  # the label column
        firstValues = elements[0].getElements()
        secondValues = elements[1].getElements()
        labels = trainingClass.getElements()

        distances = []
        for row in range(trainingClass.getNumberElements()):
            b1 = float(firstValues[row])
            b2 = float(secondValues[row])
            distance = self.calculateDistance(a1, a2, b1, b2)
            distances.append(DistanceHelper(distance, a1, a2, b1, b2, labels[row]))
        distances.sort(key=operator.attrgetter('distance'))

        votes = {}
        ranking = []  # labels in order of first (nearest) appearance
        # Slicing tolerates training sets smaller than NEIGHBOURS.
        for neighbour in distances[:self.NEIGHBOURS]:
            print(neighbour)
            # getClassName must be *called*; stringifying the bound method
            # yields a different text per object and no real label.
            label = neighbour.getClassName()
            if label not in votes:
                votes[label] = 0
                ranking.append(label)
            votes[label] += 1
        # list.sort is stable, also with reverse=True, so ties stay
        # nearest-first.
        ranking.sort(key=votes.get, reverse=True)
        return ranking

    def calculateDistance(self, a1, a2, b1, b2):
        """Return the Euclidean distance between (a1, a2) and (b1, b2).

        Kept as a separate method so it can later be replaced by a
        multi-dimensional implementation.
        """
        return math.sqrt((a1 - b1) ** 2 + (a2 - b2) ** 2)

class NaiveBayesImpl(DataMiningStrategy):
    """Naive Bayes classifier over categorical training columns."""

    def learnFromInput(self, elements, compareTo):
        """Predict the most probable class for one query row.

        Parameters:
            elements: training columns; the last one is the class column
                and every column before it is an attribute. Columns expose
                ``getElements()``, ``getPossibleElements()``,
                ``getNumberPossibleElements()``,
                ``getTimesElementAppeared()`` and
                ``calculateEachPossibleElemProb()``.
            compareTo: the attribute values of the query, either as a
                ``"v1|v2|..."`` string or as an indexable sequence with one
                value per attribute column (extra values are ignored).

        Returns:
            The class label with the highest P(Ci) * prod P(ak = vk | Ci).

        Raises:
            IndexError: fewer query values than attribute columns.
            ValueError: the training set contains no class values.
        """
        if isinstance(compareTo, str):
            # A raw "a|b|c" query must be tokenised; indexing the string
            # directly would compare single characters.
            compareTo = compareTo.split("|")
        classColumn = elements[-1]
        attributeColumns = elements[:-1]
        possibleelements = classColumn.getPossibleElements()
        priors = classColumn.calculateEachPossibleElemProb()  # P(Ci)

        bayesProbability = []
        for index in range(classColumn.getNumberPossibleElements()):
            probability = priors[index]
            print(probability)
            for position, column in enumerate(attributeColumns):
                # Multiply in P(a_position = v_position | Ci).
                probability *= self.calculateConditionalProbability(
                    column,
                    classColumn,
                    str(compareTo[position]),
                    str(possibleelements[index]),
                )
            print(probability, "2")
            bayesProbability.append(probability)
        for value in bayesProbability:
            print(value)
        prediction = possibleelements[bayesProbability.index(max(bayesProbability))]
        print(prediction)
        return prediction

    def calculateConditionalProbability(self, numelem1, numelem2, compare1, compare2):
        """Return P(numelem1 == compare1 | numelem2 == compare2).

        Counts the rows where both values occur together and divides by the
        number of rows whose class is ``compare2``. Every training row is
        inspected, not a fixed number of rows.

        Raises:
            ValueError: ``compare2`` never occurs in ``numelem2``.
        """
        wantedValue = str(compare1)
        wantedClass = str(compare2)
        together = 0
        for value, label in zip(numelem1.getElements(), numelem2.getElements()):
            if value == wantedValue and label == wantedClass:
                together += 1
        classIndex = numelem2.getPossibleElements().index(wantedClass)
        return together / numelem2.getTimesElementAppeared()[classIndex]

class ElementToCalculate(object):
    """One column of the training set with per-value occurrence counts.

    Values are stored as strings. ``possibleelements`` lists the distinct
    values in order of first appearance and ``everyTimeElementAppeared``
    holds, at the same index, how often each one occurred.
    """

    # Counter slots allocated up front; the list grows past this on demand.
    _INITIAL_COUNTER_SLOTS = 50

    def __init__(self, name):
        """Create an empty column called ``name``."""
        self.name = name
        self.numberOfPossibleElements = 0
        self.possibleelements = []
        self.numberofelements = 0
        self.elements = []
        self.everyTimeElementAppeared = [0] * self._INITIAL_COUNTER_SLOTS

    def getName(self):
        """Return the column name as a string."""
        return str(self.name)

    def getElements(self):
        """Return every stored value, in insertion order."""
        return self.elements

    def getNumberElements(self):
        """Return how many values have been added."""
        return int(self.numberofelements)

    def getPossibleElements(self):
        """Return the distinct values, in order of first appearance."""
        return self.possibleelements

    def getNumberPossibleElements(self):
        """Return how many distinct values have been seen."""
        return int(self.numberOfPossibleElements)

    def getTimesElementAppeared(self):
        """Return the occurrence counters, indexed like the distinct values."""
        return self.everyTimeElementAppeared

    def addElement(self, elementname):
        """Append one value to the column and update its occurrence count.

        The value is converted to ``str`` before the membership test so a
        non-string input is not registered as a new distinct value every
        time it is added.
        """
        value = str(elementname)
        if value not in self.possibleelements:
            # Unknown value: register it as a new distinct value.
            self.possibleelements.append(value)
            self.numberOfPossibleElements += 1
        position = self.possibleelements.index(value)
        counters = self.everyTimeElementAppeared
        if position >= len(counters):
            # More distinct values than pre-allocated slots: grow the list
            # instead of failing with IndexError.
            counters.extend([0] * (position + 1 - len(counters)))
        self.elements.append(value)
        self.numberofelements += 1
        counters[position] += 1

    def calculateEachPossibleElemProb(self):
        """Return the relative frequency of each distinct value.

        The result is indexed like ``getPossibleElements()`` and is empty
        for an empty column.
        """
        total = self.numberofelements
        return [
            self.everyTimeElementAppeared[index] / total
            for index in range(self.numberOfPossibleElements)
        ]

class DistanceHelper(object):
    """Record of one query-to-training-point comparison.

    Bundles the distance, both points and the training point's class so
    the nearest-neighbour code can sort and print the comparisons.
    """

    def __init__(self, distance, a1, a2, b1, b2, bClass):
        """Store the distance, the query (a1, a2), the training point
        (b1, b2) and the class label (kept as a string)."""
        super(DistanceHelper, self).__init__()
        self.distance = distance
        self.a1, self.a2 = a1, a2
        self.b1, self.b2 = b1, b2
        self.bClass = str(bClass)

    def __repr__(self):
        """Return a one-line human-readable description."""
        pieces = [
            'Distance: ', str(self.distance),
            ' from ', str(self.a1), ', ', str(self.a2),
            ' and ', str(self.b1), ', ', str(self.b2),
            ' and with class: ', self.bClass,
        ]
        return ''.join(pieces)

    def getClassName(self):
        """Return the class label of the training point as a string."""
        label = self.bClass
        return str(label)


#-----Helper functions: print and read from the training set-----

def printAllElements(elements):
    """Dump the training set column by column.

    For every column this prints its name and size, each stored value on
    its own line, and then each distinct value with its probability.
    """
    for column in elements:
        print(column.getName(), 'with', column.getNumberElements(), 'elements')
        for value in column.getElements():
            print(value)
        print('And possible elements:')
        frequencies = column.calculateEachPossibleElemProb()
        distinct = column.getPossibleElements()
        limit = column.getNumberPossibleElements()
        index = 0
        while index < limit:
            print(distinct[index], 'with probability of:', frequencies[index])
            index += 1


def readFileTokenizer(name):
    """Read ``<name>.txt`` and split every line on the '|' separator.

    Parameters:
        name: path of the file without its ``.txt`` extension.

    Returns:
        A list with one entry per line, each a list of the '|'-separated
        fields with the trailing newline removed.

    Raises:
        OSError: the file cannot be opened or read.
    """
    # The context manager closes the file even if reading fails part-way.
    with open(name + '.txt', 'r') as handle:
        return [line.rstrip('\n').split('|') for line in handle]

def main():
    """Interactive entry point: load a training file and run one algorithm.

    Asks for the training file name (without ``.txt``), builds one
    ElementToCalculate per header field, fills the columns from the
    remaining lines, then asks which algorithm to run and for the query
    to classify. An invalid algorithm choice prints a message and returns
    without running anything.
    """
    lines = readFileTokenizer(str(input('Digite o nome do arquivo de treinamento\n')))
    header, rows = lines[0], lines[1:]
    elements = [ElementToCalculate(columnName) for columnName in header]
    for row in rows:
        if row == ['']:
            # A blank line (typically at the end of the file) is not data.
            continue
        for position, column in enumerate(elements):
            column.addElement(row[position])

    choice = int(input("Type 0 for Nth Nearest, or 1 for Naive Bayes\n"))
    if choice == 0:
        dataMiningAlgorithm = NNearestImpl()
    elif choice == 1:
        dataMiningAlgorithm = NaiveBayesImpl()
    else:
        print("You didn't type 0 or 1, you liar! Run it again!")
        # A bare `exit` does nothing; stop here so no unbound strategy
        # is used below.
        return
    dataMiningAlgorithm.learnFromInput(elements, str(input("Type the data to be predicted/learned from, like '9.1|11.0' or 'weekday|winter|high|heavy'\n")))
    #input("Press enter to continue...")#just to use it in single-time windows settings

#main()

# Scikit-learn comparison run: train on every row but the last five of
# trainingNear2.csv and predict the held-out five with Gaussian Naive Bayes
# and a distance-weighted k-nearest-neighbours classifier.
mydata = pd.read_csv('trainingNear2.csv')
target = mydata["Class"]
#iris = datasets.load_iris()
#data = iris.data[:,:2]
#target = iris.target
# Every column except the last is a feature. `.ix` no longer exists in
# current pandas; `.iloc` is the positional equivalent.
data = mydata.iloc[:, :-1]
#print(data)

data_train = data[:-5]
target_train = target[:-5]
data_test = data[-5:]
target_test = target[-5:]

print(data_train)
print(target_train)

# The Gaussian Naive Bayes
# (the assignment below used to be fused into the comment line, leaving
# `gnb` undefined)
gnb = GaussianNB()
gnb.fit(data_train, target_train)
print("Prediction with GaussianNB: %s" % (gnb.predict(data_test)))

# K-nearest neighbours
knn = KNeighborsClassifier(weights='distance', algorithm='auto')
knn.fit(data_train, target_train)
print("Prediction with knn: %s" % knn.predict(data_test))
print(target_test)
data_test = np.array(data_test)
target_test = np.array(target_test)

## trainingBayes.csv

          
            Day
            Season
            Wind
            Rain
            Class

            
              weekday
              spring
              none
              none
              on time

            
              weekday
              winter
              none
              slight
              on time

            
              weekday
              winter
              none
              slight
              on time

            
              weekday
              winter
              high
              heavy
              late

            
              saturday
              summer
              normal
              none
              on time

            
              weekday
              autumn
              normal
              none
              very late

            
              holiday
              summer
              high
              slight
              on time

            
              sunday
              summer
              normal
              none
              on time

            
              weekday
              winter
              high
              heavy
              very late

            
              weekday
              summer
              none
              slight
              on time

            
              saturday
              spring
              high
              heavy
              cancelled

            
              weekday
              summer
              high
              slight
              on time

            
              saturday
              winter
              normal
              none
              late

            
              weekday
              summer
              high
              none
              on time

            
              weekday
              winter
              normal
              heavy
              very late

            
              saturday
              autumn
              high
              slight
              on time

            
              weekday
              autumn
              none
              heavy
              on time

            
              holiday
              spring
              normal
              slight
              on time

            
              weekday
              spring
              normal
              none
              on time

            
              weekday
              spring
              normal
              slight
              on time

## trainingNNearest.csv

          
            Attribute1
            Attribute2
            Class

            
              0.8
              6.3
              0

            
              1.4
              8.1
              0

            
              2.1
              7.4
              0

            
              2.6
              14.3
              1

            
              6.8
              12.6
              0

            
              8.8
              9.8
              1

            
              9.2
              11.6
              0

            
              10.8
              9.6
              1

            
              11.8
              9.9
              1

            
              12.4
              6.5
              1

            
              12.8
              1.1
              0

            
              14.0
              19.9
              0

            
              14.2
              18.5
              0

            
              15.6
              17.4
              0

            
              15.8
              12.2
              0

            
              16.6
              6.7
              1

            
              17.4
              4.5
              1

            
              18.2
              6.9
              1

            
              19.0
              3.4
              0

            
              19.6
              11.1
              1
	#!/usr/bin/env python
	import math
	import operator
	from sklearn import datasets
	from sklearn import svm
	import numpy as np
	import pandas as pd
	from sklearn.neighbors import KNeighborsClassifier
	from sklearn.naive_bayes import GaussianNB
	import matplotlib.pyplot as plt
	from matplotlib.colors import ListedColormap

	class DataMiningStrategy(object):
	    """Common duck-typed interface for the data-mining algorithms.

	    The base class carries no algorithm of its own; calling its
	    ``learnFromInput`` always fails so that subclasses must override it.
	    """

	    def learnFromInput(self, elements, compareTo):
	        """Abstract hook: raise NotImplementedError unconditionally."""
	        raise NotImplementedError("Implement the learn from input!")

	class NNearestImpl(DataMiningStrategy):
	    """Nearest-neighbours classifier over two numeric attributes.

	    The ``NEIGHBOURS`` closest training rows vote on the class label.
	    """

	    # How many of the closest training rows take part in the vote.
	    NEIGHBOURS = 5

	    def learnFromInput(self, elements, compareTo):
	        """Rank the class labels found among the nearest training rows.

	        Parameters:
	            elements: training columns. ``elements[0]`` and ``elements[1]``
	                hold the two numeric attributes and ``elements[-1]`` the
	                class label; each column exposes ``getElements()`` and the
	                label column also ``getNumberElements()``.
	            compareTo: the query point written as ``"x|y"``.

	        Returns:
	            The distinct class labels (as strings) seen among the nearest
	            rows, most votes first. Labels with equal votes keep
	            nearest-first order, so the first entry is the prediction.
	            The list is empty when there are no training rows.

	        Raises:
	            IndexError: ``compareTo`` has fewer than two ``|`` fields.
	            ValueError: a coordinate cannot be parsed as a float.
	        """
	        # The separator is a plain '|'; the backslash-escaped form never
	        # occurs in the input.
	        coordinates = compareTo.split("|")
	        a1 = float(coordinates[0])
	        a2 = float(coordinates[1])
	        trainingClass = elements[-1]  # the label column
	        firstValues = elements[0].getElements()
	        secondValues = elements[1].getElements()
	        labels = trainingClass.getElements()

	        distances = []
	        for row in range(trainingClass.getNumberElements()):
	            b1 = float(firstValues[row])
	            b2 = float(secondValues[row])
	            distance = self.calculateDistance(a1, a2, b1, b2)
	            distances.append(DistanceHelper(distance, a1, a2, b1, b2, labels[row]))
	        distances.sort(key=operator.attrgetter('distance'))

	        votes = {}
	        ranking = []  # labels in order of first (nearest) appearance
	        # Slicing tolerates training sets smaller than NEIGHBOURS.
	        for neighbour in distances[:self.NEIGHBOURS]:
	            print(neighbour)
	            # getClassName must be *called*; stringifying the bound method
	            # yields a different text per object and no real label.
	            label = neighbour.getClassName()
	            if label not in votes:
	                votes[label] = 0
	                ranking.append(label)
	            votes[label] += 1
	        # list.sort is stable, also with reverse=True, so ties stay
	        # nearest-first.
	        ranking.sort(key=votes.get, reverse=True)
	        return ranking

	    def calculateDistance(self, a1, a2, b1, b2):
	        """Return the Euclidean distance between (a1, a2) and (b1, b2).

	        Kept as a separate method so it can later be replaced by a
	        multi-dimensional implementation.
	        """
	        return math.sqrt((a1 - b1) ** 2 + (a2 - b2) ** 2)

	class NaiveBayesImpl(DataMiningStrategy):
	    """Naive Bayes classifier over categorical training columns."""

	    def learnFromInput(self, elements, compareTo):
	        """Predict the most probable class for one query row.

	        Parameters:
	            elements: training columns; the last one is the class column
	                and every column before it is an attribute. Columns expose
	                ``getElements()``, ``getPossibleElements()``,
	                ``getNumberPossibleElements()``,
	                ``getTimesElementAppeared()`` and
	                ``calculateEachPossibleElemProb()``.
	            compareTo: the attribute values of the query, either as a
	                ``"v1|v2|..."`` string or as an indexable sequence with one
	                value per attribute column (extra values are ignored).

	        Returns:
	            The class label with the highest P(Ci) * prod P(ak = vk | Ci).

	        Raises:
	            IndexError: fewer query values than attribute columns.
	            ValueError: the training set contains no class values.
	        """
	        if isinstance(compareTo, str):
	            # A raw "a|b|c" query must be tokenised; indexing the string
	            # directly would compare single characters.
	            compareTo = compareTo.split("|")
	        classColumn = elements[-1]
	        attributeColumns = elements[:-1]
	        possibleelements = classColumn.getPossibleElements()
	        priors = classColumn.calculateEachPossibleElemProb()  # P(Ci)

	        bayesProbability = []
	        for index in range(classColumn.getNumberPossibleElements()):
	            probability = priors[index]
	            print(probability)
	            for position, column in enumerate(attributeColumns):
	                # Multiply in P(a_position = v_position | Ci).
	                probability *= self.calculateConditionalProbability(
	                    column,
	                    classColumn,
	                    str(compareTo[position]),
	                    str(possibleelements[index]),
	                )
	            print(probability, "2")
	            bayesProbability.append(probability)
	        for value in bayesProbability:
	            print(value)
	        prediction = possibleelements[bayesProbability.index(max(bayesProbability))]
	        print(prediction)
	        return prediction

	    def calculateConditionalProbability(self, numelem1, numelem2, compare1, compare2):
	        """Return P(numelem1 == compare1 | numelem2 == compare2).

	        Counts the rows where both values occur together and divides by the
	        number of rows whose class is ``compare2``. Every training row is
	        inspected, not a fixed number of rows.

	        Raises:
	            ValueError: ``compare2`` never occurs in ``numelem2``.
	        """
	        wantedValue = str(compare1)
	        wantedClass = str(compare2)
	        together = 0
	        for value, label in zip(numelem1.getElements(), numelem2.getElements()):
	            if value == wantedValue and label == wantedClass:
	                together += 1
	        classIndex = numelem2.getPossibleElements().index(wantedClass)
	        return together / numelem2.getTimesElementAppeared()[classIndex]

	class ElementToCalculate(object):
	    """One column of the training set with per-value occurrence counts.

	    Values are stored as strings. ``possibleelements`` lists the distinct
	    values in order of first appearance and ``everyTimeElementAppeared``
	    holds, at the same index, how often each one occurred.
	    """

	    # Counter slots allocated up front; the list grows past this on demand.
	    _INITIAL_COUNTER_SLOTS = 50

	    def __init__(self, name):
	        """Create an empty column called ``name``."""
	        self.name = name
	        self.numberOfPossibleElements = 0
	        self.possibleelements = []
	        self.numberofelements = 0
	        self.elements = []
	        self.everyTimeElementAppeared = [0] * self._INITIAL_COUNTER_SLOTS

	    def getName(self):
	        """Return the column name as a string."""
	        return str(self.name)

	    def getElements(self):
	        """Return every stored value, in insertion order."""
	        return self.elements

	    def getNumberElements(self):
	        """Return how many values have been added."""
	        return int(self.numberofelements)

	    def getPossibleElements(self):
	        """Return the distinct values, in order of first appearance."""
	        return self.possibleelements

	    def getNumberPossibleElements(self):
	        """Return how many distinct values have been seen."""
	        return int(self.numberOfPossibleElements)

	    def getTimesElementAppeared(self):
	        """Return the occurrence counters, indexed like the distinct values."""
	        return self.everyTimeElementAppeared

	    def addElement(self, elementname):
	        """Append one value to the column and update its occurrence count.

	        The value is converted to ``str`` before the membership test so a
	        non-string input is not registered as a new distinct value every
	        time it is added.
	        """
	        value = str(elementname)
	        if value not in self.possibleelements:
	            # Unknown value: register it as a new distinct value.
	            self.possibleelements.append(value)
	            self.numberOfPossibleElements += 1
	        position = self.possibleelements.index(value)
	        counters = self.everyTimeElementAppeared
	        if position >= len(counters):
	            # More distinct values than pre-allocated slots: grow the list
	            # instead of failing with IndexError.
	            counters.extend([0] * (position + 1 - len(counters)))
	        self.elements.append(value)
	        self.numberofelements += 1
	        counters[position] += 1

	    def calculateEachPossibleElemProb(self):
	        """Return the relative frequency of each distinct value.

	        The result is indexed like ``getPossibleElements()`` and is empty
	        for an empty column.
	        """
	        total = self.numberofelements
	        return [
	            self.everyTimeElementAppeared[index] / total
	            for index in range(self.numberOfPossibleElements)
	        ]

	class DistanceHelper(object):
	    """Record of one query-to-training-point comparison.

	    Bundles the distance, both points and the training point's class so
	    the nearest-neighbour code can sort and print the comparisons.
	    """

	    def __init__(self, distance, a1, a2, b1, b2, bClass):
	        """Store the distance, the query (a1, a2), the training point
	        (b1, b2) and the class label (kept as a string)."""
	        super(DistanceHelper, self).__init__()
	        self.distance = distance
	        self.a1 = a1
	        self.a2 = a2
	        self.b1 = b1
	        self.b2 = b2
	        self.bClass = str(bClass)

	    def __repr__(self):
	        """Return a one-line human-readable description."""
	        return ('Distance: '+str(self.distance)+' from '+str(self.a1)+', '+str(self.a2)+' and '+str(self.b1)+', '+str(self.b2)+' and with class: '+self.bClass)

	    def getClassName(self):
	        """Return the class label of the training point as a string."""
	        return str(self.bClass)


	#-----Helper functions: print and read from the training set-----

	def printAllElements(elements):
	    """Dump the training set column by column.

	    For every column this prints its name and size, each stored value on
	    its own line, and then each distinct value with its probability.
	    """
	    for line in elements:
	        print(line.getName(), 'with', line.getNumberElements(), 'elements')
	        for elems in line.getElements():
	            print(elems)
	        print('And possible elements:')
	        probabilites = line.calculateEachPossibleElemProb()
	        possibleelements = line.getPossibleElements()
	        for x in range(0, line.getNumberPossibleElements()):
	            print(possibleelements[x], 'with probability of:', probabilites[x])


	def readFileTokenizer(name):
	    """Read ``<name>.txt`` and split every line on the '|' separator.

	    Parameters:
	        name: path of the file without its ``.txt`` extension.

	    Returns:
	        A list with one entry per line, each a list of the '|'-separated
	        fields with the trailing newline removed.

	    Raises:
	        OSError: the file cannot be opened or read.
	    """
	    # The separator is a plain '|'; a backslash-escaped pipe would never
	    # match the training files. The context manager closes the file even
	    # if reading fails part-way.
	    with open(name + '.txt', 'r') as handle:
	        return [line.rstrip('\n').split('|') for line in handle]

	def main():
	    """Interactive entry point: load a training file and run one algorithm.

	    Asks for the training file name (without ``.txt``), builds one
	    ElementToCalculate per header field, fills the columns from the
	    remaining lines, then asks which algorithm to run and for the query
	    to classify. An invalid algorithm choice prints a message and returns
	    without running anything.
	    """
	    lines = readFileTokenizer(str(input('Digite o nome do arquivo de treinamento\n')))
	    header, rows = lines[0], lines[1:]
	    elements = [ElementToCalculate(columnName) for columnName in header]
	    for row in rows:
	        if row == ['']:
	            # A blank line (typically at the end of the file) is not data.
	            continue
	        for position, column in enumerate(elements):
	            column.addElement(row[position])

	    choice = int(input("Type 0 for Nth Nearest, or 1 for Naive Bayes\n"))
	    if choice == 0:
	        dataMiningAlgorithm = NNearestImpl()
	    elif choice == 1:
	        dataMiningAlgorithm = NaiveBayesImpl()
	    else:
	        print("You didn't type 0 or 1, you liar! Run it again!")
	        # A bare `exit` does nothing; stop here so no unbound strategy
	        # is used below.
	        return
	    # The examples in the prompt use the real '|' separator, without the
	    # stray backslashes.
	    dataMiningAlgorithm.learnFromInput(elements, str(input("Type the data to be predicted/learned from, like '9.1|11.0' or 'weekday|winter|high|heavy'\n")))
	    #input("Press enter to continue...")#just to use it in single-time windows settings

	#main()

	# Scikit-learn comparison run: train on every row but the last five of
	# trainingNear2.csv and predict the held-out five with Gaussian Naive Bayes
	# and a distance-weighted k-nearest-neighbours classifier.
	mydata = pd.read_csv('trainingNear2.csv')
	target = mydata["Class"]
	#iris = datasets.load_iris()
	#data = iris.data[:,:2]
	#target = iris.target
	# Every column except the last is a feature. `.ix` no longer exists in
	# current pandas; `.iloc` is the positional equivalent.
	data = mydata.iloc[:, :-1]
	#print(data)

	data_train = data[:-5]
	target_train = target[:-5]
	data_test = data[-5:]
	target_test = target[-5:]

	print(data_train)
	print(target_train)

	# The Gaussian Naive Bayes
	# (the assignment below used to be fused into the comment line, leaving
	# `gnb` undefined)
	gnb = GaussianNB()
	gnb.fit(data_train, target_train)
	print("Prediction with GaussianNB: %s" % (gnb.predict(data_test)))

	# K-nearest neighbours
	knn = KNeighborsClassifier(weights='distance', algorithm='auto')
	knn.fit(data_train, target_train)
	print("Prediction with knn: %s" % knn.predict(data_test))
	print(target_test)
	data_test = np.array(data_test)
	target_test = np.array(target_test)
Day	Season	Wind	Rain	Class
weekday	spring	none	none	on time
weekday	winter	none	slight	on time
weekday	winter	none	slight	on time
weekday	winter	high	heavy	late
saturday	summer	normal	none	on time
weekday	autumn	normal	none	very late
holiday	summer	high	slight	on time
sunday	summer	normal	none	on time
weekday	winter	high	heavy	very late
weekday	summer	none	slight	on time
saturday	spring	high	heavy	cancelled
weekday	summer	high	slight	on time
saturday	winter	normal	none	late
weekday	summer	high	none	on time
weekday	winter	normal	heavy	very late
saturday	autumn	high	slight	on time
weekday	autumn	none	heavy	on time
holiday	spring	normal	slight	on time
weekday	spring	normal	none	on time
weekday	spring	normal	slight	on time
Attribute1	Attribute2	Class
0.8	6.3	0
1.4	8.1	0
2.1	7.4	0
2.6	14.3	1
6.8	12.6	0
8.8	9.8	1
9.2	11.6	0
10.8	9.6	1
11.8	9.9	1
12.4	6.5	1
12.8	1.1	0
14.0	19.9	0
14.2	18.5	0
15.6	17.4	0
15.8	12.2	0
16.6	6.7	1
17.4	4.5	1
18.2	6.9	1
19.0	3.4	0
19.6	11.1	1