import pandas as pd
from sklearn.feature_selection import f_regression
# inputs:
# X: pandas.DataFrame, features
# y: pandas.Series, target variable
# K: number of features to select
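
# example inputs (illustrative sketch, not part of the original gist): a synthetic
# regression dataset built with scikit-learn's make_regression; replace with your own X, y, K
from sklearn.datasets import make_regression
X, y = make_regression(n_samples=1000, n_features=20, n_informative=5, random_state=0)
X = pd.DataFrame(X, columns=[f"feature_{j}" for j in range(X.shape[1])])
y = pd.Series(y, name="target")
K = 5
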
# compute F-statistics and initialize correlation matrix
F = pd.Series(f_regression(X, y)[0], index = X.columns)
corr = pd.DataFrame(.00001, index = X.columns, columns = X.columns)
# initialize list of selected features and list of excluded features
selected = []
not_selected = X.columns.to_list()
# repeat K times
for i in range(K):

    # compute (absolute) correlations between the last selected feature and all the (currently) excluded features
    if i > 0:
        last_selected = selected[-1]
        corr.loc[not_selected, last_selected] = X[not_selected].corrwith(X[last_selected]).abs().clip(.00001)

    # compute FCQ score for all the (currently) excluded features (this is Formula 2)
    score = F.loc[not_selected] / corr.loc[not_selected, selected].mean(axis = 1).fillna(.00001)

    # find best feature, add it to selected and remove it from not_selected
    best = score.index[score.argmax()]
    selected.append(best)
    not_selected.remove(best)
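
# illustrative output check (not part of the original gist):
# the K selected feature names, in the order they were chosen
print(selected)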