# finlytics-hub/ODI_feature_selection.py

## ODI_feature_selection.py
# Player names play no part in the feature-selection and model-training steps, so every
# column in the label range 'Country_Player_1' .. 'Opposition_Player_12' is removed in place.
# NOTE(review): no backup of the DF is taken by this statement - confirm one is made upstream.
final_data.drop(
    columns=final_data.loc[:, 'Country_Player_1':'Opposition_Player_12'].columns.tolist(),
    inplace=True,
)


# Categorical Feature Selection
# separate the target from the candidate features
y = final_data['Result']
X = final_data.drop(columns=['Result'])

# categorical columns to be evaluated against the target
categorical_columns = ['Country', 'Opposition', 'Home/Away', 'Ground', 'Toss Won?']
# one feature name and one rounded p-value are collected per evaluated column
chi2_check = {'Feature': [], 'p-value': []}
for column in categorical_columns:
    # run chi2_contingency on the Result-vs-column cross-tabulation; only the p-value is kept
    chi, p, dof, ex = chi2_contingency(pd.crosstab(y, X[column]))
    chi2_check['Feature'].append(column)
    chi2_check['p-value'].append(np.round(p, 4))
# tabulate the results, smallest p-value first
chi2_result = pd.DataFrame(chi2_check).sort_values('p-value')


# Numerical Feature Selection
# list of numerical columns; the target is dropped first so that a numerically encoded
# 'Result' can never end up among the features and be scored against itself
numerical_columns = final_data.drop(columns=['Result']).select_dtypes('number').columns.values
# keep only numerical columns
X = final_data[numerical_columns]
# calculate the F-Statistic and corresponding p-values for each numerical column
F, pval = f_classif(X, final_data['Result'])
# store the results in a DF: one row per feature, smallest p-value first
F_Values = pd.DataFrame(
    {'F-Statistic': np.round(F, 2), 'p-value': np.round(pval, 4)},
    index=numerical_columns,
    dtype=float,
).sort_values('p-value')
	# NOTE: the feature-selection steps above (player-column drop, chi-square screening of the
	# categorical columns, F-test of the numerical columns) are deliberately run only once.
	# A repeated copy must not be placed here: after the first in-place drop the player columns
	# no longer exist, so slicing 'Country_Player_1':'Opposition_Player_12' again raises KeyError.