Skip to content

Instantly share code, notes, and snippets.

@ah89
Last active July 16, 2018 16:35
Show Gist options
  • Save ah89/2a8d2fc2e6b34e5911627eacb699bd18 to your computer and use it in GitHub Desktop.
Save ah89/2a8d2fc2e6b34e5911627eacb699bd18 to your computer and use it in GitHub Desktop.
greedy and decision tree
import numpy as np
import pandas as pd
import copy
from sklearn import tree
import graphviz
class Greedy:
    """Greedy search for the subsets in which a given tuple stands out most.

    The "outlyingness" of the tuple inside a subset is measured by
    ``score_m``: the absolute distance between the tuple's metric value and
    the subset mean, divided by the subset standard deviation.

    Attributes:
        set_vlaues: DataFrame holding the full set of rows to search in.
            The misspelling is kept on purpose because callers read the
            attribute under this name.
        given_tuple: single-row DataFrame describing the tuple of interest.
        metric: name of the column used to compute the score.
    """

    def __init__(self, set_vlaues, given_tuple, metric):
        self.given_tuple = given_tuple
        self.set_vlaues = set_vlaues
        self.metric = metric

    def score_m(self, s):
        """Return the score of the given tuple with respect to subset ``s``.

        Args:
            s: DataFrame containing the ``metric`` column.

        Returns:
            ``|t - mean(s)| / std(s)`` as a Python float, or ``inf`` when
            the standard deviation of ``s`` is zero.
        """
        t_m = self.given_tuple[self.metric].values
        S_m = s[self.metric].values
        sd = np.std(S_m)
        if sd == 0:
            return float("inf")
        mean = float(np.sum(S_m)) / S_m.shape[0]
        # ``.item()`` extracts the single element explicitly; calling
        # ``float()`` on a 1-element array is deprecated in NumPy.
        return float((np.abs(t_m - mean) / sd).item())

    def s_stars(self, S, all_sets):
        """Collect the subsets of ``S`` that cannot be improved greedily.

        Starting from ``S``, every row except the given tuple is tentatively
        removed. Removals whose score is at least the current score are
        candidates; the search recurses into every candidate that reaches
        the maximum score. When no removal qualifies, ``S`` itself is
        appended to ``all_sets`` unless an equal frame is already there.

        Args:
            S: DataFrame to reduce.
            all_sets: list that receives the resulting DataFrames; it is
                modified in place.
        """
        # Work on an ordered list of labels (not a set): ``.loc`` needs a
        # list-like indexer and a stable row order keeps ``equals`` reliable
        # for the de-duplication below.
        labels = S.index.unique().tolist()
        # The protected row is the tuple this instance was built with,
        # not a module-level global.
        target_label = self.given_tuple.index.values[0]
        base_score = self.score_m(S.loc[labels])

        candidates = []
        scores = []
        for label in labels:
            if label == target_label:
                continue
            remaining = [other for other in labels if other != label]
            score = self.score_m(S.loc[remaining])
            if score >= base_score:
                candidates.append(remaining)
                scores.append(score)

        if candidates:
            score_arr = np.asarray(scores)
            for pos in np.flatnonzero(score_arr == score_arr.max()):
                self.s_stars(S.loc[candidates[pos]], all_sets)
        elif not any(known.equals(S) for known in all_sets):
            all_sets.append(S)
class DecisionTree:
    """Fit one decision tree per star set to describe its membership.

    For every DataFrame in ``starsets`` a 0/1 label vector over the rows of
    ``data`` is built (1 = row belongs to the star set, the given tuple is
    always 0) and a ``DecisionTreeClassifier`` can be trained on it.

    Attributes:
        x: feature matrix used for training.
        feature_names: column names matching the columns of ``x``.
        labels: list with one label vector per star set.
    """

    def __init__(self, data, starsets, id_col_name, gtuple, cat=0,
                 target_col_name='survived'):
        """Prepare the feature matrix and the label vectors.

        Args:
            data: DataFrame with all rows.
            starsets: list of DataFrames, each a subset of ``data``.
            id_col_name: name of the column identifying a row.
            gtuple: single-row DataFrame with the tuple of interest.
            cat: when 1, every non-id column is encoded through
                ``conversion_dicts``/``data2num``; otherwise the id and
                target columns are dropped and the rest is used as is.
            target_col_name: column excluded from the features when
                ``cat`` is not 1.
        """
        self.tuple = gtuple
        self.data = data
        self.starsets = starsets
        self.id_col_name = id_col_name
        self.target_col_name = target_col_name
        self.columns_name = list(self.data.columns.values)
        if cat == 1:
            self.conversion_dicts()
            self.data2num()
        else:
            features = self.data.drop(
                [self.target_col_name, self.id_col_name], axis=1)
            self.feature_names = features.columns.values.tolist()
            # ``.values`` replaces ``as_matrix()``, which pandas removed.
            self.x = features.values
        # Labels are required by ``decision_tree`` in both modes.
        self.create_labels()

    def conversion_dicts(self):
        """Build per-column value<->code lookup tables.

        Fills ``cat2num[col][value] -> code`` and
        ``num2cat[col][code] -> value``. Codes follow the order of first
        appearance in the column, so the encoding is reproducible.
        """
        self.cat2num = {}
        self.num2cat = {}
        for col in self.columns_name:
            col_values = self.data[col].unique().tolist()
            self.cat2num[col] = {
                value: code for code, value in enumerate(col_values)}
            self.num2cat[col] = dict(enumerate(col_values))

    def data2num(self):
        """Encode every non-id column of ``data`` with ``cat2num``.

        Stores the encoded copy in ``num_data`` and its matrix form in
        ``x``; ``feature_names`` is set to the columns of that matrix.
        """
        # NOTE(review): missing values (NaN) get no special treatment here;
        # confirm the input has none before relying on this path.
        self.num_data = self.data.copy()
        for col in self.columns_name:
            if col == self.id_col_name:
                continue
            mapping = self.cat2num[col]
            # Whole-column assignment replaces the removed per-cell
            # ``DataFrame.set_value`` calls.
            self.num_data[col] = [
                mapping[value] for value in self.data[col].tolist()]
        self.feature_names = self.num_data.columns.values.tolist()
        self.x = self.num_data.values

    def create_labels(self):
        """Build one 0/1 membership vector per star set.

        A row of ``data`` is labelled 1 when its id occurs in the star set.
        The row carrying the index label of the given tuple is always 0.
        Vectors are positional (aligned with the rows of ``x``), so they
        stay correct when the index of ``data`` is not 0..n-1.
        """
        self.labels = []
        ids = self.data[self.id_col_name]
        target_label = self.tuple.index.values[0]
        is_target = np.asarray(self.data.index == target_label, dtype=bool)
        for df in self.starsets:
            member = ids.isin(df[self.id_col_name].values).values
            self.labels.append((member & ~is_target).astype(int).tolist())

    def decision_tree(self, index):
        """Train a tree for star set ``index`` and export it as DOT.

        The DOT text is written to ``"decisionTree for <index>.dot"`` in
        the current directory.

        Args:
            index: position of the star set / label vector to use.

        Returns:
            A ``graphviz.Source`` built from the exported DOT text.
        """
        clf = tree.DecisionTreeClassifier()
        clf = clf.fit(self.x, self.labels[index])
        # ``export_graphviz`` only returns the DOT text when ``out_file`` is
        # None, so request the text and write the file explicitly.
        dot_data = tree.export_graphviz(
            clf, out_file=None, feature_names=self.feature_names)
        dot_path = "decisionTree for " + str(index) + ".dot"
        with open(dot_path, "w") as handle:
            handle.write(dot_data)
        graph = graphviz.Source(dot_data)
        # graph.render("decisionTreefor" + str(index))
        return graph
# Test
# Driver script: load the Titanic sample, find the star sets for one tuple
# and export a decision tree describing each of them.
data = pd.read_csv("titanic_clean_10.csv")

# Replace every text column by its integer category codes.
for col_name in data.columns:
    if data[col_name].dtype == 'object':
        data[col_name] = data[col_name].astype('category').cat.codes
# Function-call form of print, consistent with the calls below and valid on
# both Python 2 and Python 3.
print(data)

# Optional binning of the numeric columns, currently disabled:
# data["age"] = pd.Series(pd.cut(data['age'], bins= 5, retbins=False, labels = ["child", "youth", "senior", "old", "very_old"]))
# data["fare"] = pd.Series(pd.cut(data['fare'], bins= 4, retbins=False, labels = ["very_low", "low", "high", "very_high"]))
data = data.drop(['name', 'body', 'boat', 'home.dest', 'cabin', 'ticket', 'sibsp', 'parch'], axis=1)
# Keep only the rows whose "survived" value is known.
data = data[data["survived"].notnull()]

S = copy.copy(data)
# Index label of the tuple of interest; kept as a module-level name.
t_index = 3
# Slice (rather than scalar lookup) so that ``t`` stays a one-row DataFrame.
t = data.loc[t_index:t_index, :]
print(S.index.values)
print(t)

all_stars_sets = []
gr = Greedy(S, t, "survived")
gr.s_stars(gr.set_vlaues, all_stars_sets)

dt = DecisionTree(data, all_stars_sets, 'id', t)
for i in range(len(all_stars_sets)):
    dt.decision_tree(i)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment