import pandas as pd
from sklearn.feature_selection import f_regression
# inputs:
# X: pandas.DataFrame, features
# y: pandas.Series, target variable
# K: number of features to select
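
# example inputs (illustrative sketch, not part of the original gist): a synthetic
# regression dataset built with scikit-learn's make_regression; replace with your own X, y, K
from sklearn.datasets import make_regression
X, y = make_regression(n_samples=1000, n_features=20, n_informative=5, random_state=0)
X = pd.DataFrame(X, columns=[f"feature_{j}" for j in range(X.shape[1])])
y = pd.Series(y, name="target")
K = 5
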
# compute F-statistics and initialize correlation matrix
F = pd.Series(f_regression(X, y)[0], index = X.columns)
corr = pd.DataFrame(.00001, index = X.columns, columns = X.columns)
# initialize list of selected features and list of excluded features
selected = []
not_selected = X.columns.to_list()
# repeat K times
for i in range(K):

    # compute (absolute) correlations between the last selected feature and all the (currently) excluded features
    if i > 0:
        last_selected = selected[-1]
        corr.loc[not_selected, last_selected] = X[not_selected].corrwith(X[last_selected]).abs().clip(.00001)

    # compute FCQ score for all the (currently) excluded features (this is Formula 2)
    score = F.loc[not_selected] / corr.loc[not_selected, selected].mean(axis = 1).fillna(.00001)

    # find best feature, add it to selected and remove it from not_selected
    best = score.index[score.argmax()]
    selected.append(best)
    not_selected.remove(best)
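
# illustrative output check (not part of the original gist):
# the K selected feature names, in the order they were chosen
print(selected)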