Skip to content

Instantly share code, notes, and snippets.

@danielmoralesp
Last active June 23, 2020 18:02
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save danielmoralesp/185a248df9a9a6ee93cd4fc70c71cb9f to your computer and use it in GitHub Desktop.
Save danielmoralesp/185a248df9a9a6ee93cd4fc70c71cb9f to your computer and use it in GitHub Desktop.
import operator
from collections import Counter
import random
import numpy as np
np.random.seed(1)
random.seed(1)
def split(dataset, labels, column):
data_subsets = []
label_subsets = []
counts = list(set([data[column] for data in dataset]))
counts.sort()
for k in counts:
new_data_subset = []
new_label_subset = []
for i in range(len(dataset)):
if dataset[i][column] == k:
new_data_subset.append(dataset[i])
new_label_subset.append(labels[i])
data_subsets.append(new_data_subset)
label_subsets.append(new_label_subset)
return data_subsets, label_subsets
def gini(dataset):
impurity = 1
label_counts = Counter(dataset)
for label in label_counts:
prob_of_label = label_counts[label] / len(dataset)
impurity -= prob_of_label ** 2
return impurity
def information_gain(starting_labels, split_labels):
info_gain = gini(starting_labels)
for subset in split_labels:
info_gain -= gini(subset) * len(subset)/len(starting_labels)
return info_gain
class Leaf:
def __init__(self, labels, value):
self.labels = Counter(labels)
self.value = value
class Internal_Node:
def __init__(self,
feature,
branches,
value):
self.feature = feature
self.branches = branches
self.value = value
def find_best_split_subset(dataset, labels, num_features):
features = np.random.choice(6, 3, replace=False)
best_gain = 0
best_feature = 0
for feature in features:
data_subsets, label_subsets = split(dataset, labels, feature)
gain = information_gain(labels, label_subsets)
if gain > best_gain:
best_gain, best_feature = gain, feature
return best_feature, best_gain
def find_best_split(dataset, labels):
best_gain = 0
best_feature = 0
for feature in range(len(dataset[0])):
data_subsets, label_subsets = split(dataset, labels, feature)
gain = information_gain(labels, label_subsets)
if gain > best_gain:
best_gain, best_feature = gain, feature
return best_feature, best_gain
def build_tree(data, labels, value = ""):
best_feature, best_gain = find_best_split(data, labels)
if best_gain < 0.00000001:
return Leaf(Counter(labels), value)
data_subsets, label_subsets = split(data, labels, best_feature)
branches = []
for i in range(len(data_subsets)):
branch = build_tree(data_subsets[i], label_subsets[i], data_subsets[i][0][best_feature])
branches.append(branch)
return Internal_Node(best_feature, branches, value)
def build_tree_forest(data,labels, n_features, value=""):
best_feature, best_gain = find_best_split_subset(data, labels, n_features)
if best_gain < 0.00000001:
return Leaf(Counter(labels), value)
data_subsets, label_subsets = split(data, labels, best_feature)
branches = []
for i in range(len(data_subsets)):
branch = build_tree_forest(data_subsets[i], label_subsets[i], n_features, data_subsets[i][0][best_feature])
branches.append(branch)
return Internal_Node(best_feature, branches, value)
def print_tree(node, spacing=""):
"""World's most elegant tree printing function."""
question_dict = {0: "Buying Price", 1:"Price of maintenance", 2:"Number of doors", 3:"Person Capacity", 4:"Size of luggage boot", 5:"Estimated Saftey"}
# Base case: we've reached a leaf
if isinstance(node, Leaf):
print (spacing + str(node.labels))
return
# Print the question at this node
print (spacing + "Splitting on " + question_dict[node.feature])
# Call this function recursively on the true branch
for i in range(len(node.branches)):
print (spacing + '--> Branch ' + node.branches[i].value+':')
print_tree(node.branches[i], spacing + " ")
def make_cars():
f = open("car.data", "r")
cars = []
for line in f:
cars.append(line.rstrip().split(","))
return cars
def change_data(data):
dicts = [{'vhigh' : 1.0, 'high' : 2.0, 'med' : 3.0, 'low' : 4.0},
{'vhigh' : 1.0, 'high' : 2.0, 'med' : 3.0, 'low' : 4.0},
{'2' : 1.0, '3' : 2.0, '4' : 3.0, '5more' : 4.0},
{'2' : 1.0, '4' : 2.0, 'more' : 3.0},
{'small' : 1.0, 'med' : 2.0, 'big' : 3.0},
{'low' : 1.0, 'med' : 2.0, 'high' : 3.0}]
for row in data:
for i in range(len(dicts)):
row[i] = dicts[i][row[i]]
return data
def classify(datapoint, tree):
if isinstance(tree, Leaf):
return max(tree.labels.items(), key=operator.itemgetter(1))[0]
value = datapoint[tree.feature]
for branch in tree.branches:
if branch.value == value:
return classify(datapoint, branch)
#return classify(datapoint, tree.branches[random.randint(0, len(tree.branches)-1)])
cars = make_cars()
random.shuffle(cars)
car_data = [x[:-1] for x in cars]
car_labels = [x[-1] for x in cars]
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment