Skip to content

Instantly share code, notes, and snippets.

@bis-carbon
Last active December 22, 2016 05:13
Show Gist options
  • Save bis-carbon/cfc8976e6d8b7b62aa58050f5ad682c5 to your computer and use it in GitHub Desktop.
Save bis-carbon/cfc8976e6d8b7b62aa58050f5ad682c5 to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
from math import log
import operator
import os
import math
import sys
import collections
def main():
    """Build a decision tree from 'delaytraining.csv' and write it to 'tree.txt'.

    The first CSV row is treated as the header; its last column (the decision
    column) is dropped from the label list. Remaining rows are cleaned
    (delay numbers -> category labels, empty cells -> '0') before training.
    """
    rows = read_csv()
    # Header row minus the decision column's own header.
    labels = rows[0][:-1]
    values = rows[1:]
    values = update_decision_column(values)
    values = update_null(values)
    decisiontree = tree(values, labels)
    # 'with' guarantees the handle is closed even if write() raises
    # (the original leaked the handle on error).
    with open('tree.txt', 'w') as f:
        f.write(str(decisiontree))
    print(decisiontree)
def tree(data, labels):
    """Recursively build an ID3 decision tree as nested dicts.

    data   -- list of rows; each row is feature values plus the class label last.
    labels -- feature names aligned with the feature columns of `data`.
    Returns either a class label (leaf) or {feature_name: {value: subtree}}.

    Bug fix: the original `del(labels[bestFeat])` mutated the caller's list;
    we now work on a copy so the argument is left intact.
    """
    classList = [row[-1] for row in data]
    # All rows share one class: pure leaf.
    if classList.count(classList[0]) == len(classList):
        return classList[0]
    # No features left to split on: majority vote.
    if len(data[0]) == 1:
        return majority(classList)
    bestFeat = choose(data)
    bestFeatLabel = labels[bestFeat]
    # Copy without the chosen feature instead of deleting from the caller's list.
    remaining = labels[:bestFeat] + labels[bestFeat + 1:]
    theTree = {bestFeatLabel: {}}
    for value in set(row[bestFeat] for row in data):
        theTree[bestFeatLabel][value] = tree(split(data, bestFeat, value), remaining)
    return theTree
def read_csv(filename='delaytraining.csv'):
    """Read a CSV file and return its rows as lists of strings.

    filename -- path to the file (defaults to the original hard-coded name,
                so existing callers are unchanged).
    NOTE: uses a plain split(',') like the original, so quoted fields that
    contain commas are not handled.
    """
    rows = []
    # 'with' closes the handle even on error (the original leaked it).
    with open(filename, 'r') as fh:
        for line in fh:
            rows.append(line.strip().split(','))
    return rows
def update_decision_column(decision1):
    """Replace each row's numeric last column with a delay category label.

    Empty cell -> 'Diverted'; negative -> 'Early'; zero -> 'On_time';
    positive -> 'Delayed'. Rows are mutated in place; the same list is
    returned for convenience.

    Cleanup: the original kept a manual index and re-stored each (already
    mutated) row back into the list — dead bookkeeping, removed here.
    """
    for row in decision1:
        raw = row[-1]
        if not raw:
            row[-1] = 'Diverted'
        else:
            delay = int(raw)
            if delay < 0:
                row[-1] = 'Early'
            elif delay == 0:
                row[-1] = 'On_time'
            else:
                row[-1] = 'Delayed'
    return decision1
def update_null(data1):
    """Replace every empty/falsy cell with the string '0'.

    Rows are mutated in place; the same list is returned.
    Cleanup: replaced the original's manual row/column counters and the
    redundant re-store of each row with enumerate().
    """
    for row in data1:
        for j, cell in enumerate(row):
            if not cell:
                row[j] = '0'
    return data1
def check_null(value):
    """Return `value` unchanged, or 0 when it is falsy (empty string, None, 0...)."""
    return value if value else 0
def entropy(data):
    """Shannon entropy (base 2) of the class labels in the last column of `data`."""
    total = len(data)
    label_counts = collections.Counter(row[-1] for row in data)
    result = 0.0
    for count in label_counts.values():
        probability = float(count) / total
        result -= probability * log(probability, 2)
    return result
def split(data, axis, val):
    """Rows of `data` whose feature at `axis` equals `val`, with that column removed."""
    return [row[:axis] + row[axis + 1:] for row in data if row[axis] == val]
def choose(data):
    """Return the index of the feature with the highest information gain (ID3).

    Returns -1 when no feature improves on the base entropy.
    """
    baseEntropy = entropy(data)
    bestFeat = -1
    bestGain = 0.0
    for i in range(len(data[0]) - 1):
        # Expected entropy after splitting on feature i, weighted by subset size.
        weighted = 0.0
        for value in {row[i] for row in data}:
            subset = split(data, i, value)
            weighted += (len(subset) / float(len(data))) * entropy(subset)
        gain = baseEntropy - weighted
        if gain > bestGain:
            bestGain = gain
            bestFeat = i
    return bestFeat
def majority(classList):
    """Return the most common class label in `classList`.

    Ties are broken by first-counted order (the sort is stable).
    Bug fix: the original called dict.iteritems(), which was removed in
    Python 3 and raised AttributeError; dict.items() works on 2 and 3.
    """
    classCount = {}
    for vote in classList:
        if vote not in classCount:
            classCount[vote] = 0
        classCount[vote] += 1
    ranked = sorted(classCount.items(), key=operator.itemgetter(1), reverse=True)
    return ranked[0][0]
# Guard the entry point so importing this module does not trigger training.
if __name__ == "__main__":
    main()
Display the source blob
Display the rendered blob
Raw
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment