Last active
December 22, 2016 05:13
-
-
Save bis-carbon/cfc8976e6d8b7b62aa58050f5ad682c5 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from math import log
import operator
import os
import math
import sys
import collections
def main():
    """Build a decision tree from 'delaytraining.csv' and save it to 'tree.txt'."""
    lines = read_csv()
    # First row holds the column headers; drop the last (decision) column's header.
    labels = lines[0][:-1]
    values = lines[1:]
    values = update_decision_column(values)
    values = update_null(values)
    decisiontree = tree(values, labels)
    # Context manager guarantees the file is closed even if the write fails.
    with open('tree.txt', 'w') as f:
        f.write(str(decisiontree))
    print(decisiontree)
def tree(data, labels):
    """Recursively build an ID3 decision tree.

    data   -- list of rows; the last element of each row is the class label.
    labels -- feature names for the non-class columns of ``data``.

    Returns a class label (leaf) or a nested dict of the form
    {feature_label: {feature_value: subtree}}.

    Unlike the original, this no longer mutates the caller's ``labels``
    list (the original ``del(labels[bestFeat])`` modified it in place).
    """
    classList = [ex[-1] for ex in data]
    # All rows share one class: this branch is a leaf.
    if classList.count(classList[0]) == len(classList):
        return classList[0]
    # No features left to split on: vote for the majority class.
    if len(data[0]) == 1:
        return majority(classList)
    bestFeat = choose(data)
    bestFeatLabel = labels[bestFeat]
    theTree = {bestFeatLabel: {}}
    # Reduced label list for the subtrees, built without mutating `labels`.
    subLabels = labels[:bestFeat] + labels[bestFeat + 1:]
    featValues = [ex[bestFeat] for ex in data]
    for value in set(featValues):
        theTree[bestFeatLabel][value] = tree(split(data, bestFeat, value), subLabels)
    return theTree
def read_csv():
    """Read 'delaytraining.csv' and return a list of rows (lists of strings).

    NOTE(review): naive comma split — does not handle quoted fields;
    switch to the csv module if the data can contain embedded commas.
    """
    rows = []
    # Context manager guarantees the handle is closed even on error
    # (the original left the file open).
    with open('delaytraining.csv', 'r') as fh:
        for line in fh:
            rows.append(line.strip().split(','))
    return rows
def update_decision_column(decision1):
    """Replace the numeric delay in each row's last column with a category.

    Empty value -> 'Diverted'; negative -> 'Early'; zero -> 'On_time';
    positive -> 'Delayed'.  Rows are modified in place and the same list
    is returned for chaining.

    Raises ValueError if a non-empty last column is not an integer
    (same as the original's int() call).
    """
    # The original kept an index `i` and re-assigned decision1[i] = values2,
    # a no-op since values2 IS decision1[i]; plain iteration suffices.
    for row in decision1:
        delay = row[-1]
        if not delay:
            row[-1] = 'Diverted'
        else:
            delay = int(delay)
            if delay < 0:
                row[-1] = 'Early'
            elif delay == 0:
                row[-1] = 'On_time'
            else:
                row[-1] = 'Delayed'
    return decision1
def update_null(data1):
    """Replace every empty/falsy cell with the string '0'.

    Rows are modified in place; the same list is returned for chaining.
    """
    for row in data1:
        # Slice assignment mutates the existing row object, preserving
        # the original's in-place behavior without index bookkeeping.
        row[:] = [cell if cell else '0' for cell in row]
    return data1
def check_null(value):
    """Return `value` unchanged, or 0 when it is falsy (empty string, None, 0)."""
    return value if value else 0
def entropy(data):
    """Shannon entropy (base 2) of the class labels (last column) of `data`.

    data -- non-empty list of rows; the last element of each row is the label.
    """
    total = len(data)
    # Frequency of each class label; replaces the hand-rolled dict counting.
    counts = collections.Counter(row[-1] for row in data)
    # Renamed from `entropy` to avoid shadowing this function's own name.
    result = 0.0
    for count in counts.values():
        probability = count / float(total)
        result -= probability * log(probability, 2)
    return result
def split(data, axis, val):
    """Rows of `data` whose feature at `axis` equals `val`, with that column removed."""
    return [row[:axis] + row[axis + 1:] for row in data if row[axis] == val]
def choose(data):
    """Return the index of the feature with the highest information gain.

    Returns -1 when no feature improves on the base entropy.
    """
    num_features = len(data[0]) - 1
    base_entropy = entropy(data)
    best_gain, best_index = 0.0, -1
    for index in range(num_features):
        # Weighted entropy of the partition induced by this feature.
        weighted_entropy = 0.0
        for value in set(row[index] for row in data):
            subset = split(data, index, value)
            weight = len(subset) / float(len(data))
            weighted_entropy += weight * entropy(subset)
        gain = base_entropy - weighted_entropy
        if gain > best_gain:
            best_gain, best_index = gain, index
    return best_index
def majority(classList):
    """Return the most common class label in `classList`.

    Fixes the Python 2-only dict.iteritems() call, which raises
    AttributeError on Python 3 (the rest of the file uses print()).
    Ties go to the label encountered first, matching the original's
    stable sort over insertion-ordered counts.
    """
    counts = collections.Counter(classList)
    return counts.most_common(1)[0][0]
main() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment