Skip to content

Instantly share code, notes, and snippets.

@nmattia
Created May 30, 2018 18:54
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save nmattia/87affb99f175cefc3975f402b9087496 to your computer and use it in GitHub Desktop.
Save nmattia/87affb99f175cefc3975f402b9087496 to your computer and use it in GitHub Desktop.
Decision trees in Python
import pandas as pd
import math as math
def loop(df, target):
    """Recursively split *df* on the attribute with the highest information gain.

    Parameters
    ----------
    df : pandas.DataFrame
        The (sub)set of examples still to be split; must contain *target*.
    target : str
        Name of the class/label column.

    Returns
    -------
    None -- the tree is only walked, nothing is returned.
    """
    # Stop when no candidate attributes remain, or when the node is pure
    # (a single label left means entropy 0 and nothing useful to split).
    if df.columns.size <= 1 or df[target].nunique() <= 1:
        return
    gains = info_gains(df, target)
    best = max(gains, key=gains.get)
    # Drop the chosen attribute before recursing. Without this, a group in
    # which every attribute is constant reproduces the parent frame verbatim
    # and the recursion never terminates.
    df.groupby(best).apply(lambda sub: loop(sub.drop(columns=[best]), target))
def log2(x):
    """Return the base-2 logarithm of *x*.

    Uses math.log2, which the stdlib documents as more accurate than
    math.log(x, 2): the latter computes log(x)/log(2) and can be off in
    the last bit even for exact powers of two.
    """
    return math.log2(x)
def info_gains(df, target):
    """Information gain of every non-target column of *df*.

    Returns a dict mapping column name -> gain with respect to *target*.
    """
    # Skip the target column up front rather than computing its (meaningless)
    # self-gain and discarding it afterwards.
    return {
        col: gain_of_attr(df, col, target)
        for col in df
        if col != target
    }
def gain_of_attr(df, attr, target):
    """Information gain of splitting *df* on column *attr* w.r.t. *target*.

    gain = H(target) - sum_v P(attr == v) * H(target | attr == v)
    """
    total = float(df[target].size)
    weighted_entropy = 0
    for _, subset in df.groupby(attr):
        weight = subset[target].size / total  # P(attr == this value)
        weighted_entropy += weight * entropy_of(subset[target])
    return entropy_of(df[target]) - weighted_entropy
def entropy_of(col):
    """Shannon entropy (in bits) of the value distribution of *col*."""
    # Accumulate p * log2(p) over each value's relative frequency.
    # value_counts never yields zero proportions, so log2 is always defined.
    acc = 0
    for p in proportion_of_attr(col):
        acc += p * log2(p)
    return -acc
def proportion_of_attr(col):
    """Relative frequency of each distinct value in *col*.

    Returns a Series indexed by value, where each entry is count / len(col).
    """
    n = float(col.size)
    # Vectorized division over the counts Series, elementwise.
    return col.value_counts() / n
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment