|
library(data.tree) |
|
|
|
entropy <- function(q) {

  # Shannon entropy (in bits) of a Bernoulli variable with success
  # probability q. Vectorized over q; q is assumed to lie in [0, 1].

  h <- -(q * log2(q) + (1 - q) * log2(1 - q))

  # q of exactly 0 or 1 produces 0 * -Inf = NaN; entropy there is 0 by
  # convention, which also spares callers from casting NaN themselves.

  ifelse(is.nan(h), 0, h)

}
|
|
|
positiveRatio <- function(data, outcomeCol = ncol(data)) {

  # Fraction of rows whose outcome value is TRUE.
  #
  # data:       a data.frame whose outcome column is logical (or comparable
  #             to TRUE).
  # outcomeCol: index of the outcome column; defaults to the last column.
  #
  # NA outcomes are not counted as positives, matching the original
  # length(which(...)) behavior, but still contribute to the denominator.

  sum(data[, outcomeCol] == TRUE, na.rm = TRUE) / nrow(data)

}
|
|
|
gain <- function(data, attributeCol, outcomeCol = ncol(data), precision = 3) {

  # Information gain of the attribute in `attributeCol` with respect to the
  # boolean outcome in `outcomeCol`.
  #
  # data:         data.frame of observations.
  # attributeCol: index of the attribute to score.
  # outcomeCol:   index of the logical outcome column (default: last).
  # precision:    decimal places used when rounding intermediate entropies.

  # Total entropy of the system, from the overall positive ratio.
  systemEntropy <- round(entropy(positiveRatio(data, outcomeCol)), precision)

  # Partition the observations by the attribute's distinct values
  # (e.g. sunny, overcast, rainy).
  groups <- split(data, data[, attributeCol])

  # Weighted entropy contribution of each partition.
  contributions <- vapply(groups, function(group) {

    # Share of all observations that fall into this partition.
    weight <- nrow(group) / nrow(data)

    # Entropy of the outcome within this partition.
    groupEntropy <- entropy(sum(group[, outcomeCol] == TRUE, na.rm = TRUE) / nrow(group))

    # A pure partition can yield NaN (0 * -Inf); its contribution is 0.
    contribution <- weight * groupEntropy

    round(ifelse(is.nan(contribution), 0, contribution), precision)

  }, numeric(1))

  # Information gain = system entropy minus the weighted partition entropies.
  systemEntropy - sum(contributions)

}
|
|
|
pure <- function(data, outcomeCol = ncol(data)) {

  # A data set is "pure" when every row carries the same outcome value.

  outcomes <- data[, outcomeCol]

  length(unique(outcomes)) == 1

}
|
|
|
ID3 <- function(node, data, outcomeCol = ncol(data)) {

  # Recursively grow a data.tree decision tree rooted at `node` using the
  # ID3 algorithm: at each step, split on the attribute with the highest
  # information gain until every subset is pure.
  #
  # NOTE(review): the recursive call below relies on the default outcomeCol,
  # so the outcome is assumed to stay the LAST column of `data` — which
  # holds here because only non-outcome columns are ever dropped.

  node$obsCount <- nrow(data)

  if (pure(data, outcomeCol)) {

    # All outcomes agree: attach a leaf named after the shared outcome value.
    child <- node$AddChild(unique(data[, outcomeCol]))

    node$feature <- tail(names(data), 1)

    child$obsCount <- nrow(data)

    child$feature <- ''

  }

  else {

    # Score every non-outcome attribute by its information gain.
    gains <- vapply(colnames(data)[-outcomeCol], function(colName) {

      gain(data, which(colnames(data) == colName), outcomeCol)

    }, numeric(1))

    # which.max returns the first maximum, preserving the original [1]
    # tie-breaking behavior.
    feature <- names(gains)[which.max(gains)]

    node$feature <- feature

    # Split on the chosen attribute and drop its column from each subset.
    childObs <- split(data[, !(names(data) %in% feature)], data[, feature], drop = TRUE)

    for (i in seq_along(childObs)) {

      # One child per attribute value, then recurse into that subset.
      child <- node$AddChild(names(childObs)[i])

      ID3(child, childObs[[i]])

    }

  }

}
|
|
|
# Read dataset.

data <- read.table('weather.tsv', header = TRUE)

# Convert the last column ('yes'/'no') to a logical outcome; the comparison
# itself yields TRUE/FALSE, so no ifelse(..., T, F) round-trip is needed.

data[, ncol(data)] <- tolower(data[, ncol(data)]) == 'yes'



# Test calculating information gain for all columns.

sapply(seq_len(ncol(data)), function(i) { print(gain(data, i)) })



# Train ID3 to build a decision tree.

tree <- Node$new('Should_Play')

ID3(tree, data)

print(tree, 'feature')