Skip to content

Instantly share code, notes, and snippets.

@kayitt
Created April 6, 2020 17:52
Show Gist options
  • Save kayitt/bf8a99d064e4e0306364ab39647f6e75 to your computer and use it in GitHub Desktop.
Save kayitt/bf8a99d064e4e0306364ab39647f6e75 to your computer and use it in GitHub Desktop.
Feature importance for logistic regression
"""Plot per-class feature importances for a logistic-regression text classifier.

For every class, draw a horizontal bar chart of the 10 features with the
largest coefficients ("in favor", green) followed by the 5 features with the
smallest coefficients ("against", red).
"""
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
import matplotlib.pyplot as plt
import numpy as np

# BUG FIX: the original built a bare LogisticRegression() and then accessed
# model.named_steps.tfidf / .classifier, which only exists on a Pipeline.
# The step names below match the attribute names the original code used.
model = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('classifier', LogisticRegression()),
])
# model.fit(...)

# Map each vocabulary term to its per-class coefficient vector.
# coef_.T has shape (n_features, n_classes), so each dict value is one row
# of per-class weights for that feature.
# NOTE: get_feature_names() was removed in scikit-learn 1.2; use
# get_feature_names_out() instead.
my_dict = dict(zip(model.named_steps['tfidf'].get_feature_names_out(),
                   model.named_steps['classifier'].coef_.T))
coefs = pd.DataFrame.from_dict(my_dict, orient='index')
coefs.columns = model.named_steps['classifier'].classes_

for category in coefs.columns:
    # features "in favor" are those with the largest coefficients
    vals = list(coefs[category].nlargest(10).values) + list(
        coefs[category].nsmallest(5).sort_values(ascending=False).values)
    # features "against" are those with the smallest coefficients
    names = list(coefs[category].nlargest(10).index) + list(
        coefs[category].nsmallest(5).sort_values(ascending=False).index)
    # features "in favour" of the category are colored green, those "against" red
    colors = ['green' if x > 0 else 'red' for x in vals]
    # reverse so barh draws the strongest feature at the top of the chart
    vals.reverse()
    names.reverse()
    fig = plt.figure(figsize=(15, 10))
    pos = np.arange(len(vals)) + .5
    plt.barh(pos, vals, align='center', color=colors)
    plt.yticks(pos, names)
    title = f'Local explanation for class {category}'
    plt.title(title)
    plt.show()
@gosia-malgosia
Copy link

gosia-malgosia commented Apr 6, 2020

Or for more generic cases:

# Build a DataFrame of model coefficients: one row per feature, one column per class.
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
# you need three things:
# NOTE(review): `my_tfidf` and `my_OvR_classifier` are placeholders — substitute
# your own fitted vectorizer and one-vs-rest classifier before running.
my_feature_names = my_tfidf.get_feature_names() # list of feature names (len = number_of_features)
my_weights = my_OvR_classifier.coef_.T # the classifier must be OvR and of the shape
                                       # equal to (number_of_features x number_of_classes)
my_labels = my_OvR_classifier.classes_ # or if you used LabelEncoder: label_encoder_obj.classes_
# and then you put them into a df:
my_dict = dict(zip(my_feature_names, my_weights))
coef_df = pd.DataFrame.from_dict(my_dict, orient='index')
coef_df.columns = my_labels
# coef_df is now the full table of per-feature, per-label weights
# to print a chart for selected labels, use the function below;
# example call: show_features(['cats', 'dogs'], 20)
def show_features(category_list, n):
    """Plot the most influential features for each class in *category_list*.

    For each category, draws a horizontal bar chart of the *n* features with
    the largest coefficients in ``coef_df`` (green, "in favor") followed by
    the 5 features with the smallest coefficients (red, "against"). To skip
    the negative features, remove the parts starting with
    ``+ list(coef_df[category].nsmallest(5)``.

    Parameters:
        category_list: list of column labels present in ``coef_df``,
            e.g. ``['cats', 'dogs']``.
        n: number of top (largest-coefficient) features per chart.

    Returns:
        The matplotlib Figure of the last chart drawn, or ``None`` when
        *category_list* is empty.
    """
    # BUG FIX: `fig` was previously unbound, so an empty category_list made
    # the final `return fig` raise UnboundLocalError.
    fig = None
    for category in category_list:
        # taller figure for more bars
        fig = plt.figure(figsize=(10, n / 4))
        # top-n positive features, then the 5 most negative ones
        vals = list(coef_df[category].nlargest(n).values) + list(
            coef_df[category].nsmallest(5).sort_values(ascending=False).values)
        names = list(coef_df[category].nlargest(n).index) + list(
            coef_df[category].nsmallest(5).sort_values(ascending=False).index)
        # reverse so barh draws the strongest feature at the top of the chart
        vals.reverse()
        names.reverse()
        colors = ['green' if x > 0 else 'red' for x in vals]
        pos = np.arange(len(vals)) + .5
        plt.barh(pos, vals, align='center', color=colors)
        plt.yticks(pos, names)
        title = f'Local explanation for class {category}'
        plt.title(title)
        plt.show()
    return fig

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment