# egemenzeytinci/feature_importance.py

## feature_importance.py
from rfpimp import permutation_importances
from sklearn.base import clone
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
import pandas as pd


def imp_df(column_names, importances):
    """Return feature importances as a table sorted from highest to lowest.

    Args:
        column_names: Feature labels; they become the 'Feature' index.
        importances: One importance value per entry in ``column_names``.

    Returns:
        A DataFrame indexed by 'Feature' with a single 'Importance' column,
        ordered by descending importance.
    """
    table = pd.DataFrame({'Feature': column_names, 'Importance': importances})
    indexed = table.set_index('Feature')
    return indexed.sort_values(by='Importance', ascending=False)


def r2(rf, X_train, y_train):
    """Score ``rf`` with ``r2_score`` on the data it is given.

    Used in this file as the metric callable handed to
    ``permutation_importances``.

    Args:
        rf: Object exposing ``predict``.
        X_train: Inputs passed to ``rf.predict``.
        y_train: True target values compared against the predictions.

    Returns:
        The value of ``r2_score(y_train, predictions)``.
    """
    predictions = rf.predict(X_train)
    return r2_score(y_train, predictions)


def drop_col_feat_imp(model, X_train, y_train, random_state=42):
    """Compute drop-column feature importances.

    A clone of ``model`` is fitted on all columns to obtain a benchmark
    score. Then, for each column, a fresh clone is fitted without that column
    and the drop in score (benchmark minus reduced score) is recorded as the
    column's importance. All scores come from the estimator's own ``score``
    method, evaluated on the same data used for fitting.

    Args:
        model: Estimator to clone; the object passed in is never fitted here.
        X_train: Feature table; must expose ``columns`` and ``drop``.
        y_train: Target values.
        random_state: Value assigned to each clone's ``random_state``
            attribute so every fit uses the same seed.

    Returns:
        The DataFrame produced by ``imp_df``: one row per column of
        ``X_train``, sorted by descending importance.
    """
    def _fit_and_score(features):
        # A fresh clone per fit: nothing learned in one run leaks into another.
        estimator = clone(model)
        estimator.random_state = random_state
        estimator.fit(features, y_train)
        return estimator.score(features, y_train)

    benchmark_score = _fit_and_score(X_train)

    # Each reduced frame is built once and reused for both fit and score.
    importances = [
        benchmark_score - _fit_and_score(X_train.drop(col, axis=1))
        for col in X_train.columns
    ]

    return imp_df(X_train.columns, importances)


# Features are every column except the target and the 'title'/'genres' columns.
# NOTE(review): `dataset` is not defined in the visible part of this file —
# presumably a DataFrame loaded beforehand; confirm.
X = dataset.drop(['average_rating', 'title', 'genres'], axis=1)
y = dataset['average_rating']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=0,
)

# NOTE(review): the split and drop_col_feat_imp are seeded, but this forest is
# not given a random_state, so the first two tables may vary between runs.
rf = RandomForestRegressor(n_estimators=10)
rf.fit(X_train, y_train)

# 1) Importances reported by the fitted forest itself.
print('Random Forest Feature Importances:')
print(imp_df(X.columns, rf.feature_importances_))
print()

# 2) Permutation importances from rfpimp, scored with r2 on the training data.
print('Permutation Feature Importance:')
print(permutation_importances(rf, X_train, y_train, r2))
print()

# 3) Drop-column importances (refits a clone of rf once per column).
print('Drop-Col Feature Importance:')
print(drop_col_feat_imp(rf, X_train, y_train))
	from rfpimp import permutation_importances
	from sklearn.base import clone
	from sklearn.ensemble import RandomForestRegressor
	from sklearn.metrics import r2_score
	from sklearn.model_selection import train_test_split
	import pandas as pd


	def imp_df(column_names, importances):
	    """Return feature importances as a table sorted from highest to lowest.

	    Args:
	        column_names: Feature labels; they become the 'Feature' index.
	        importances: One importance value per entry in ``column_names``.

	    Returns:
	        A DataFrame indexed by 'Feature' with a single 'Importance' column,
	        ordered by descending importance.
	    """
	    data = {
	        'Feature': column_names,
	        'Importance': importances,
	    }
	    df = pd.DataFrame(data) \
	        .set_index('Feature') \
	        .sort_values('Importance', ascending=False)

	    return df


	def r2(rf, X_train, y_train):
	    """Score ``rf`` with ``r2_score`` on the data it is given.

	    Used in this file as the metric callable handed to
	    ``permutation_importances``.

	    Args:
	        rf: Object exposing ``predict``.
	        X_train: Inputs passed to ``rf.predict``.
	        y_train: True target values compared against the predictions.

	    Returns:
	        The value of ``r2_score(y_train, rf.predict(X_train))``.
	    """
	    return r2_score(y_train, rf.predict(X_train))


	def drop_col_feat_imp(model, X_train, y_train, random_state=42):
	    """Compute drop-column feature importances.

	    A clone of ``model`` is fitted on all columns to obtain a benchmark
	    score. Then, for each column, a fresh clone is fitted without that
	    column and the drop in score (benchmark minus reduced score) is
	    recorded as the column's importance. All scores come from the
	    estimator's own ``score`` method, evaluated on the same data used for
	    fitting.

	    Args:
	        model: Estimator to clone; the object passed in is never fitted here.
	        X_train: Feature table; must expose ``columns`` and ``drop``.
	        y_train: Target values.
	        random_state: Value assigned to each clone's ``random_state``
	            attribute so every fit uses the same seed.

	    Returns:
	        The DataFrame produced by ``imp_df``: one row per column of
	        ``X_train``, sorted by descending importance.
	    """
	    def _fit_and_score(features):
	        # A fresh clone per fit: nothing learned in one run leaks into another.
	        estimator = clone(model)
	        estimator.random_state = random_state
	        estimator.fit(features, y_train)
	        return estimator.score(features, y_train)

	    benchmark_score = _fit_and_score(X_train)

	    # Each reduced frame is built once and reused for both fit and score.
	    importances = [
	        benchmark_score - _fit_and_score(X_train.drop(col, axis=1))
	        for col in X_train.columns
	    ]

	    return imp_df(X_train.columns, importances)


	# Features are every column except the target and the 'title'/'genres' columns.
	# NOTE(review): `dataset` is not defined in the visible part of this file —
	# presumably a DataFrame loaded beforehand; confirm.
	X = dataset.drop(['average_rating', 'title', 'genres'], axis=1)
	y = dataset['average_rating']
	X_train, X_test, y_train, y_test = train_test_split(
	    X, y, test_size=0.33, random_state=0,
	)

	# NOTE(review): the split and drop_col_feat_imp are seeded, but this forest is
	# not given a random_state, so the first two tables may vary between runs.
	rf = RandomForestRegressor(n_estimators=10)
	rf.fit(X_train, y_train)

	# 1) Importances reported by the fitted forest itself.
	print('Random Forest Feature Importances:')
	print(imp_df(X.columns, rf.feature_importances_))
	print()

	# 2) Permutation importances from rfpimp, scored with r2 on the training data.
	print('Permutation Feature Importance:')
	print(permutation_importances(rf, X_train, y_train, r2))
	print()

	# 3) Drop-column importances (refits a clone of rf once per column).
	print('Drop-Col Feature Importance:')
	print(drop_col_feat_imp(rf, X_train, y_train))