Created
January 5, 2022 12:10
-
-
Save EvanGertis/878de25056eb1d6aa076c6a247b5fd31 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def entropy(s):
    """Return the Shannon entropy (in bits) of the ``Class`` column of ``s``.

    The entropy is ``sum(-p_i * log2(p_i))`` over the classes present in
    ``s['Class']``, where ``p_i`` is the count of class ``i`` divided by the
    number of non-null ``Class`` entries. Diagnostic output (the dataset, the
    per-class ratios and the probability list) is printed along the way.

    Args:
        s: pandas DataFrame that has a ``Class`` column.

    Returns:
        The entropy of the class distribution; ``0`` when ``s`` has no
        non-null ``Class`` entries.
    """
    print("calculating the entropy for s")
    print("*****************************")
    print(s)
    print("*****************************")

    class_column = s['Class']

    # calculate the number of classes in s
    numberOfClasses = class_column.nunique()
    print(f'Number of classes for dataset: {numberOfClasses}')

    value_counts = class_column.value_counts()
    # The number of non-null rows does not change while iterating over the
    # classes, so it is computed once up front.
    n = class_column.count()

    p = []
    # value_counts() is sorted by descending count; limiting the iteration to
    # the first `numberOfClasses` entries keeps zero-count entries (possible
    # with categorical dtypes) out of the log2 call below.
    for i, f in enumerate(value_counts.iloc[:numberOfClasses]):
        # calculate the frequency of class_i in s
        print(f'p{i} {f}/{n}')
        p.append(f / n)

    print(p)

    return sum((-pi * log2(pi) for pi in p), 0)
# This method discretizes attribute A1 of the dataset s.
# A partition stops splitting when either of the following holds:
#   a) the information gain is 0, i.e. the number of distinct classes is 1, or
#   b) min f / max f < 0.5 and the number of distinct values is floor(n/2).
def entropy_discretization(s):
    """Discretize attribute ``A1`` of ``s`` into two bins with an entropy-based split.

    Every distinct value of ``s['A1']`` is tried as a threshold. For each
    threshold the rows are split into ``A1 < threshold`` (``s1``) and
    ``A1 >= threshold`` (``s2``) and the gain of that split is obtained from
    ``information_gain(s1, s2, s)``. The search stops early when

    a) ``s1`` contains exactly one distinct class, or
    b) ``minf(s1['Class']) / maxf(s1['Class']) < 0.5`` and ``s1`` contains
       ``floor(n/2)`` distinct classes, ``n`` being the number of distinct
       classes in ``s``.

    The threshold with the largest positive gain is then used for the final
    split. ``A1`` is set to ``1`` in the lower partition and ``2`` in the upper
    partition. If no candidate has a positive gain, the partitions of the last
    evaluated threshold are used instead.

    NOTE(review): ``information_gain``, ``maxf`` and ``minf`` are defined
    outside this block; ``maxf``/``minf`` presumably return the largest and
    smallest value frequency of a Series. Confirm against their definitions.

    Args:
        s: pandas DataFrame with at least the columns ``A1`` and ``Class``.

    Returns:
        A new DataFrame made of the lower partition followed by the upper
        partition, with ``A1`` replaced by the bin labels. Rows whose ``A1``
        is NaN belong to neither partition and are therefore not included.
        When ``s['A1']`` has no values at all, a copy of ``s`` is returned.
    """
    I = {}
    n = s['Class'].nunique()
    distinct_values = s['A1'].value_counts().index
    information_gain_indicies = []
    print(f'The unique values for dataset s["A1"] are {distinct_values}')

    # Without any candidate threshold there is nothing to split; the code
    # below would otherwise fail on partitions that were never created.
    if distinct_values.empty:
        return s.copy()

    for threshold in distinct_values:
        # Step 1: pick a threshold
        print(f'Using threshold {threshold}')

        # Step 2: Partition the data set into two partitions. The threshold
        # is a value of A1, so the split has to compare A1 (not Class).
        s1 = s[s['A1'] < threshold]
        print("s1 after spitting")
        print(s1)
        print("******************")
        s2 = s[s['A1'] >= threshold]
        print("s2 after spitting")
        print(s2)
        print("******************")
        print("******************")
        print("calculating maxf")
        print(f" maxf {maxf(s['Class'])}")
        print("******************")
        print("******************")
        print("calculating minf")
        print(f" minf {minf(s['Class'])}")
        print("******************")

        # Step 3: calculate the information gain.
        informationGain = information_gain(s1, s2, s)
        I.update({f'informationGain_{threshold}': informationGain, f'threshold_{threshold}': threshold})
        print(f'added informationGain_{threshold}: {informationGain}, threshold_{threshold}: {threshold}')
        information_gain_indicies.append(threshold)

        # Step 4: check the stopping conditions on the lower partition.
        s1_classes = s1['Class'].nunique()
        print(f"Checking condition a if {s1_classes} == {1}")
        if s1_classes == 1:
            break

        s1_max_f = maxf(s1['Class'])
        s1_min_f = minf(s1['Class'])
        print(f"Checking condition b {s1_min_f}/{s1_max_f} < {0.5} {s1_classes} == {floor(n/2)}")
        # The ratio is min/max: max/min is never below 0.5 for positive
        # frequencies, which made this condition unreachable. A falsy
        # maximum (e.g. an empty partition) skips the division.
        if s1_max_f and s1_min_f / s1_max_f < 0.5 and s1_classes == floor(n / 2):
            print(f"Condition b is met{s1_min_f}/{s1_max_f} < {0.5} {s1_classes} == {floor(n/2)}")
            break

    print("Elements in I")
    print(I)
    print("*****************************")

    # Step 5: calculate the max threshold
    print("Calculating maximum threshold")
    print("*****************************")
    maxInformationGain = 0
    maxThreshold = 0
    best_found = False
    for i in information_gain_indicies:
        gain = I[f'informationGain_{i}']
        print(f"if({gain} > {maxInformationGain})")
        if gain > maxInformationGain:
            maxInformationGain = gain
            maxThreshold = I[f'threshold_{i}']
            best_found = True

    print(f'maxThreshold: {maxThreshold}, maxInformationGain: {maxInformationGain}')

    # Step 6: keep the partitions of s based on the best threshold. s1/s2
    # still hold the split of the last evaluated threshold, which is not
    # necessarily the best one, so the split is redone here.
    if best_found:
        s1 = s[s['A1'] < maxThreshold]
        s2 = s[s['A1'] >= maxThreshold]

    # Work on explicit copies so the assignments below never write into
    # (or warn about) a view of the caller's frame.
    s1 = s1.copy()
    s2 = s2.copy()

    # replace values
    print(f" {s1['A1'].value_counts().index}")
    for i in s1['A1'].value_counts().index:
        print(f"s1['A1'].replace({i},1)")
        s1['A1'] = s1['A1'].replace(i, 1)

    print(f" {s2['A1'].value_counts().index}")
    for i in s2['A1'].value_counts().index:
        print(f"s2['A1'].replace({i},2)")
        s2['A1'] = s2['A1'].replace(i, 2)

    print("s1 after replacing values")
    print(s1)
    print("******************")
    print("s2 after replacing values")
    print(s2)
    print("******************")

    partitions = [s1, s2]
    s = pd.concat(partitions)

    return s
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment