Last active
July 21, 2021 13:34
-
-
Save Kirili4ik/1364c8666b042854b35c04b5b82be720 to your computer and use it in GitHub Desktop.
IV and WoE in Python (easy): a function that calculates Information Value and Weight of Evidence. See also https://contrib.scikit-learn.org/category_encoders/woe.html
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def get_IV(df, feature, target):
    """Compute per-value Weight of Evidence (WoE) and the total Information Value (IV).

    For every distinct value of ``df[feature]`` the function counts the rows
    with ``df[target] == 0`` ("Good") and ``df[target] == 1`` ("Bad"), turns
    those counts into distributions over all goods / all bads, and derives

        WoE = ln(Distribution Good / Distribution Bad)
        IV  = WoE * (Distribution Good - Distribution Bad)

    Args:
        df: pandas DataFrame holding both columns.
        feature: name of the (categorical / binned) feature column.
        target: name of the binary target column (0 = good, 1 = bad).

    Returns:
        A tuple ``(iv, data)`` where ``iv`` is the sum of the per-value IV
        contributions and ``data`` is a DataFrame sorted by value with the
        columns 'Variable', 'Value', 'Good', 'Bad', 'Distribution Good',
        'Distribution Bad', 'WoE' and 'IV'.

    Notes:
        * Rows whose feature value is missing are left out of the table and
          of the totals. To treat missing values as their own category, fill
          them first, e.g. ``df[feature] = df[feature].fillna("NULL")``.
        * A value that has no goods or no bads would get an infinite WoE;
          it is reported as 0 so that it contributes nothing to the IV.
    """
    values = df[feature]

    # One grouped pass instead of filtering the whole frame twice per value.
    # observed=True keeps only categories that actually occur in the data.
    good = (df[target] == 0).groupby(values, observed=True).sum()  # Good (Fraud == 0)
    bad = (df[target] == 1).groupby(values, observed=True).sum()   # Bad (Fraud == 1)

    # Both groupings use the same key, so they share the same sorted index.
    data = pd.DataFrame(
        {
            'Variable': [feature] * len(good),
            'Value': good.index.to_numpy(),
            'Good': good.to_numpy().astype('int64'),
            'Bad': bad.to_numpy().astype('int64'),
        },
        columns=['Variable', 'Value', 'Good', 'Bad'],
    )

    # Totals come from the same counts as the table, so each distribution
    # sums to 1 even when some rows have a missing feature or an
    # unexpected target value.
    total_good = data['Good'].sum()
    total_bad = data['Bad'].sum()
    data['Distribution Good'] = data['Good'] / total_good
    data['Distribution Bad'] = data['Bad'] / total_bad

    # log(0) and x / 0 are expected here and handled right below, so the
    # corresponding numpy warnings are silenced.
    with np.errstate(divide='ignore', invalid='ignore'):
        woe = np.log(data['Distribution Good'] / data['Distribution Bad'])
    data['WoE'] = woe.replace([np.inf, -np.inf], 0)

    data['IV'] = data['WoE'] * (data['Distribution Good'] - data['Distribution Bad'])

    iv = data['IV'].sum()
    return iv, data
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment