Instantly share code, notes, and snippets.

# iamaziz/entropy_gain.py

Last active November 14, 2023 02:41
Show Gist options
• Save iamaziz/02491e36490eb05a30f8 to your computer and use it in GitHub Desktop.
Calculate Entropy and Information Gain for Decision Tree Learning
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode characters
 # -*- coding: utf-8 -*- # calculating the Entropy and Information Gain for: Learning with Trees # by: Aziz Alto # see Information Gain: # http://www.autonlab.org/tutorials/infogain.html from __future__ import division from math import log def entropy(pi): ''' return the Entropy of a probability distribution: entropy(p) = − SUM (Pi * log(Pi) ) defintion: entropy is a metric to measure the uncertainty of a probability distribution. entropy ranges between 0 to 1 Low entropy means the distribution varies (peaks and valleys). High entropy means the distribution is uniform. See: http://www.cs.csi.cuny.edu/~imberman/ai/Entropy%20and%20Information%20Gain.htm ''' total = 0 for p in pi: p = p / sum(pi) if p != 0: total += p * log(p, 2) else: total += 0 total *= -1 return total def gain(d, a): ''' return the information gain: gain(D, A) = entropy(D)−􏰋 SUM ( |Di| / |D| * entropy(Di) ) ''' total = 0 for v in a: total += sum(v) / sum(d) * entropy(v) gain = entropy(d) - total return gain # TEST ###__ example 1 (AIMA book, fig18.3) # set of example of the dataset willWait = [6, 6] # Yes, No # attribute, number of members (feature) patron = [ [4,0], [2,4], [0,2] ] # Some, Full, None print(gain(willWait, patron)) ###__ example 2 (playTennis homework) # set of example of the dataset playTennis = [9, 5] # Yes, No # attribute, number of members (feature) outlook = [ [4, 0], # overcase [2, 3], # sunny [3, 2] # rain ] temperature = [ [2, 2], # hot [3, 1], # cool [4, 2] # mild ] humidity = [ [3, 4], # high [6, 1] # normal ] wind = [ [6, 2], # weak [3, 3] # strong ] print(gain(playTennis, outlook)) print(gain(playTennis, temperature)) print(gain(playTennis, humidity)) print(gain(playTennis, wind))