Skip to content

Instantly share code, notes, and snippets.

@Kirili4ik
Last active July 21, 2021 13:34
Show Gist options
  • Save Kirili4ik/1364c8666b042854b35c04b5b82be720 to your computer and use it in GitHub Desktop.
Save Kirili4ik/1364c8666b042854b35c04b5b82be720 to your computer and use it in GitHub Desktop.
IV and WoE in Python (easy): a function that calculates Information Value and Weight of Evidence. See also https://contrib.scikit-learn.org/category_encoders/woe.html
def get_IV(df, feature, target):
    """Compute per-category Weight of Evidence and the feature's Information Value.

    Rows where ``df[target] == 0`` are counted as "Good" and rows where
    ``df[target] == 1`` as "Bad"; rows with any other target value are
    ignored. Rows whose feature value is missing (NaN/None) cannot be
    attributed to a category and are excluded from both the per-category
    counts and the totals, so each distribution column sums to 1. To treat
    missing values as their own category, fill them before calling, e.g.
    ``df[feature] = df[feature].fillna("NULL")``.

    Args:
        df: pandas DataFrame holding the data.
        feature: Name of the (categorical) feature column.
        target: Name of the binary target column (0 = good, 1 = bad).

    Returns:
        A tuple ``(iv, data)`` where ``iv`` is the sum of the ``'IV'`` column
        and ``data`` is a DataFrame with one row per distinct feature value,
        sorted by value, with columns ``'Variable'``, ``'Value'``, ``'Good'``,
        ``'Bad'``, ``'Distribution Good'``, ``'Distribution Bad'``, ``'WoE'``
        and ``'IV'``.
    """
    # Plain arrays are used so the grouping does not depend on df's index
    # (which may contain duplicates).
    flags = pd.DataFrame({
        'Value': df[feature].to_numpy(),
        'Good': (df[target] == 0).to_numpy().astype('int64'),
        'Bad': (df[target] == 1).to_numpy().astype('int64'),
    })

    # One pass over the data instead of two full scans per unique value.
    # groupby drops missing keys, which matches the totals computed below.
    data = (
        flags.groupby('Value', sort=False)[['Good', 'Bad']]
        .sum()
        .reset_index()
    )
    data.insert(0, 'Variable', feature)

    # Totals come from the same rows as the per-category counts, so a row is
    # never counted as "good" merely because it is not "bad".
    total_good = data['Good'].sum()
    total_bad = data['Bad'].sum()
    data['Distribution Good'] = data['Good'] / total_good
    data['Distribution Bad'] = data['Bad'] / total_bad

    # A category with no good or no bad rows gives log(0) or log(inf); those
    # infinite WoE values are mapped to 0 so they contribute nothing to IV.
    with np.errstate(divide='ignore', invalid='ignore'):
        woe = np.log(data['Distribution Good'] / data['Distribution Bad'])
    data['WoE'] = woe.replace([np.inf, -np.inf], 0)

    data['IV'] = data['WoE'] * (
        data['Distribution Good'] - data['Distribution Bad']
    )

    # 'Variable' is constant, so ordering by 'Value' alone is equivalent to
    # ordering by ['Variable', 'Value'].
    data = data.sort_values(by='Value').reset_index(drop=True)

    iv = data['IV'].sum()
    return iv, data
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment