|
library(data.tree) |
|
|
|
entropy <- function(q) {

  # Shannon entropy (in bits) of a Bernoulli variable with success
  # probability q. Vectorized over q; q is assumed to lie in [0, 1].

  h <- -(q * log2(q) + (1 - q) * log2(1 - q))

  # q of exactly 0 or 1 produces 0 * -Inf = NaN; entropy there is 0 by
  # convention, which also spares callers from casting NaN themselves.

  ifelse(is.nan(h), 0, h)

}
|
|
|
positiveRatio <- function(data, outcomeCol = ncol(data)) {

  # Fraction of rows whose outcome value is TRUE.
  #
  # data:       a data.frame whose outcome column is logical (or comparable
  #             to TRUE).
  # outcomeCol: index of the outcome column; defaults to the last column.
  #
  # NA outcomes are not counted as positives, matching the original
  # length(which(...)) behavior, but still contribute to the denominator.

  sum(data[, outcomeCol] == TRUE, na.rm = TRUE) / nrow(data)

}
|
|
|
gain <- function(data, attributeCol, outcomeCol = ncol(data), precision = 3) {

  # Information gain of the attribute in `attributeCol` with respect to the
  # boolean outcome in `outcomeCol`.
  #
  # data:         data.frame of observations.
  # attributeCol: index of the attribute to score.
  # outcomeCol:   index of the logical outcome column (default: last).
  # precision:    decimal places used when rounding intermediate entropies.

  # Total entropy of the system, from the overall positive ratio.
  systemEntropy <- round(entropy(positiveRatio(data, outcomeCol)), precision)

  # Partition the observations by the attribute's distinct values
  # (e.g. sunny, overcast, rainy).
  groups <- split(data, data[, attributeCol])

  # Weighted entropy contribution of each partition.
  contributions <- vapply(groups, function(group) {

    # Share of all observations that fall into this partition.
    weight <- nrow(group) / nrow(data)

    # Entropy of the outcome within this partition.
    groupEntropy <- entropy(sum(group[, outcomeCol] == TRUE, na.rm = TRUE) / nrow(group))

    # A pure partition can yield NaN (0 * -Inf); its contribution is 0.
    contribution <- weight * groupEntropy

    round(ifelse(is.nan(contribution), 0, contribution), precision)

  }, numeric(1))

  # Information gain = system entropy minus the weighted partition entropies.
  systemEntropy - sum(contributions)

}
|
|
|
pure <- function(data, outcomeCol = ncol(data)) {

  # A data set is "pure" when every row carries the same outcome value.

  outcomes <- data[, outcomeCol]

  length(unique(outcomes)) == 1

}
|
|
|
ID3 <- function(node, data, outcomeCol = ncol(data)) {

  # Recursively grow a data.tree decision tree rooted at `node` using the
  # ID3 algorithm: at each step, split on the attribute with the highest
  # information gain until every subset is pure.
  #
  # NOTE(review): the recursive call below relies on the default outcomeCol,
  # so the outcome is assumed to stay the LAST column of `data` — which
  # holds here because only non-outcome columns are ever dropped.

  node$obsCount <- nrow(data)

  if (pure(data, outcomeCol)) {

    # All outcomes agree: attach a leaf named after the shared outcome value.
    child <- node$AddChild(unique(data[, outcomeCol]))

    node$feature <- tail(names(data), 1)

    child$obsCount <- nrow(data)

    child$feature <- ''

  }

  else {

    # Score every non-outcome attribute by its information gain.
    gains <- vapply(colnames(data)[-outcomeCol], function(colName) {

      gain(data, which(colnames(data) == colName), outcomeCol)

    }, numeric(1))

    # which.max returns the first maximum, preserving the original [1]
    # tie-breaking behavior.
    feature <- names(gains)[which.max(gains)]

    node$feature <- feature

    # Split on the chosen attribute and drop its column from each subset.
    childObs <- split(data[, !(names(data) %in% feature)], data[, feature], drop = TRUE)

    for (i in seq_along(childObs)) {

      # One child per attribute value, then recurse into that subset.
      child <- node$AddChild(names(childObs)[i])

      ID3(child, childObs[[i]])

    }

  }

}
|
|
|
# Read dataset.

data <- read.table('weather.tsv', header = TRUE)

# Convert the last column ('yes'/'no') to a logical outcome; the comparison
# itself yields TRUE/FALSE, so no ifelse(..., T, F) round-trip is needed.

data[, ncol(data)] <- tolower(data[, ncol(data)]) == 'yes'



# Test calculating information gain for all columns.

sapply(seq_len(ncol(data)), function(i) { print(gain(data, i)) })



# Train ID3 to build a decision tree.

tree <- Node$new('Should_Play')

ID3(tree, data)

print(tree, 'feature')