Decision tree classifier (ID3) algorithm
"""
http://en.wikipedia.org/wiki/ID3_algorithm
http://www.cise.ufl.edu/~ddd/cap6635/Fall-97/Short-papers/2.htm
"""
from collections import namedtuple, Counter, defaultdict
from math import log2

UNCLASSIFIED_VALUE = -99

class ID3Classifier(object):

    def __init__(self):
        self.root_node = None

    def fit(self, samples, target):
        """Create the decision tree."""
        training_samples = [TrainingSample(s, t)
                            for s, t in zip(samples, target)]
        predicting_features = list(range(len(samples[0])))
        self.root_node = self.create_decision_tree(training_samples,
                                                   predicting_features)
    def predict(self, X, allow_unclassified=False):
        """Predict a class for each sample in X by walking the tree."""
        # When a feature value was never seen during training, fall back to
        # class 1, or to UNCLASSIFIED_VALUE if allow_unclassified is True.
        default_klass = 1
        if allow_unclassified:
            default_klass = UNCLASSIFIED_VALUE
        predicted_klasses = []
        for sample in X:
            klass = None
            current_node = self.root_node
            while klass is None:
                if current_node.is_leaf():
                    klass = current_node.klass
                else:
                    key_value = sample[current_node.feature]
                    if key_value in current_node:
                        current_node = current_node[key_value]
                    else:
                        # Unseen feature value at this node.
                        klass = default_klass
            predicted_klasses.append(klass)
        return predicted_klasses
    def score(self, X, target, allow_unclassified=True):
        """Return the fraction of samples in X predicted with the right class."""
        predicted = self.predict(X, allow_unclassified=allow_unclassified)
        n_matches = sum(p == t for p, t in zip(predicted, target))
        return 1.0 * n_matches / len(X)
    def create_decision_tree(self, training_samples, predicting_features):
        """Recursively create a decision tree and return its root node."""
        if not predicting_features:
            # No more predicting features: use the most common class.
            default_klass = self.get_most_common_class(training_samples)
            root_node = DecisionTreeLeaf(default_klass)
        else:
            klasses = [sample.klass for sample in training_samples]
            if len(set(klasses)) == 1:
                # All the samples share the same class: create a leaf.
                target_klass = training_samples[0].klass
                root_node = DecisionTreeLeaf(target_klass)
            else:
                best_feature = self.select_best_feature(training_samples,
                                                        predicting_features,
                                                        klasses)
                # Create the node to return and create the sub-trees.
                root_node = DecisionTreeNode(best_feature)
                best_feature_values = {s.sample[best_feature]
                                       for s in training_samples}
                for value in best_feature_values:
                    samples = [s for s in training_samples
                               if s.sample[best_feature] == value]
                    # Recursively create a child node. Each child gets its own
                    # copy of the remaining features, so a feature consumed in
                    # one branch is still available to its sibling branches.
                    child = self.create_decision_tree(samples,
                                                      list(predicting_features))
                    root_node[value] = child
        return root_node
    @staticmethod
    def get_most_common_class(training_samples):
        """Return the most common class in the given samples."""
        klasses = [s.klass for s in training_samples]
        counter = Counter(klasses)
        k, = counter.most_common(n=1)
        return k[0]
    def select_best_feature(self, samples, features, klasses):
        """
        Select, remove from the list and return the feature that best
        classifies the samples.

        The best feature is the one with the highest information gain for the
        provided samples (it minimizes the entropy of the resulting klasses).
        """
        gain_factors = [(self.information_gain(samples, feat, klasses), feat)
                        for feat in features]
        gain_factors.sort()
        best_feature = gain_factors[-1][1]
        features.remove(best_feature)
        return best_feature
    def information_gain(self, samples, feature, klasses):
        """
        Information gain is the difference in entropy from before to after the
        samples are split on the given feature's values. In other words, it
        measures how much uncertainty about the class was removed by splitting
        the samples on that feature.
        """
        N = len(samples)
        # Partition the samples by the value they take for the feature.
        samples_partition = defaultdict(list)
        for s in samples:
            samples_partition[s.sample[feature]].append(s)
        # Weighted average of the entropy of each partition.
        feature_entropy = 0.0
        for partition in samples_partition.values():
            sub_klasses = [s.klass for s in partition]
            feature_entropy += (len(partition) / N) * self.entropy(sub_klasses)
        return self.entropy(klasses) - feature_entropy
    @staticmethod
    def entropy(dataset):
        """Measure of the amount of uncertainty in the given dataset."""
        N = len(dataset)
        counter = Counter(dataset)
        return sum(-1.0 * (counter[k] / N) * log2(counter[k] / N)
                   for k in counter)
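
# Worked example (added for illustration, not part of the original gist):
#   entropy(['a', 'a', 'b', 'b']) is 1.0 bit (maximum uncertainty for two
#   equally likely classes), while entropy(['a', 'a', 'a', 'a']) is 0.0.
#   A feature that splits the mixed dataset into two pure halves has a
#   weighted child entropy of 0.0, so its information gain is 1.0 - 0.0 = 1.0.
assert ID3Classifier.entropy(['a', 'a', 'b', 'b']) == 1.0
assert ID3Classifier.entropy(['a', 'a', 'a', 'a']) == 0.0
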
TrainingSample = namedtuple('TrainingSample', ('sample', 'klass'))

class DecisionTreeNode(dict):
    """Internal node: maps each value of `feature` to a child node."""

    def __init__(self, feature, *args, **kwargs):
        self.feature = feature
        super(DecisionTreeNode, self).__init__(*args, **kwargs)

    def is_leaf(self):
        return False

    def __repr__(self):
        return "%s(%s)" % (self.__class__.__name__, self.feature)

class DecisionTreeLeaf(dict):
    """Leaf node: holds the class predicted for the samples that reach it."""

    def __init__(self, klass, *args, **kwargs):
        self.klass = klass
        super(DecisionTreeLeaf, self).__init__(*args, **kwargs)

    def is_leaf(self):
        return True

    def __repr__(self):
        return "%s(%s)" % (self.__class__.__name__, self.klass)