Ayushi Jain (Ayushijain09)
@Ayushijain09
Ayushijain09 / Feature Selection Techniques -2
Last active July 17, 2020 12:55
Feature Selection Techniques
1. Mutual Information
2. Chi Square
3. ANOVA
4. Pearson, Spearman, Kendall Correlation (see the sketch after this list)
5. Tree Model
6. Sequential Feature Selection
7. Variance Threshold
8. Recursive Feature Elimination
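Every technique above except the correlations has a snippet below. A minimal sketch of correlation-based selection, assuming X_train is a pandas DataFrame and y_train a Series, as in the other snippets:
# correlation of each feature with the target; method can be
# 'pearson', 'spearman', or 'kendall'
corr = X_train.corrwith(y_train, method='pearson')
top_features = corr.abs().nlargest(10).index # 10 features with the strongest correlation
X_train_new = X_train[top_features]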
@Ayushijain09
Ayushijain09 / tree_regression.py
Created July 16, 2020 20:19
Tree Model for Feature Selection
from sklearn.tree import DecisionTreeRegressor
model = DecisionTreeRegressor()
# fit the model
model.fit(X_train, y_train)
# get importance
importance = model.feature_importances_
# summarize feature importance
impList = zip(X_train.columns, importance)
for feature in sorted(impList, key=lambda t: t[1], reverse=True):
    print(feature) # (feature, importance) pairs, most important first
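To turn the importances into an actual selection, scikit-learn's SelectFromModel can reuse the fitted tree; a minimal sketch, with threshold='median' keeping the more important half of the features:
from sklearn.feature_selection import SelectFromModel
sfm = SelectFromModel(model, threshold='median', prefit=True) # wrap the already-fitted tree
X_train_new = sfm.transform(X_train)
print(X_train.columns[sfm.get_support()]) # names of the retained features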
Lasso Regression for Feature Selection
from sklearn.linear_model import Lasso
from sklearn.model_selection import GridSearchCV
lasso = Lasso()
parameters = {'alpha': [1e-15, 1e-10, 1e-8, 1e-4, 1e-3, 1e-2, 1, 3, 5]}
lasso_model = GridSearchCV(lasso, parameters, scoring='r2', cv=5)
lasso_model.fit(X_train, y_train)
pred = lasso_model.predict(X_test)
print(lasso_model.best_params_) #output {'alpha': 0.001}
print(lasso_model.best_score_) #output 0.8630550401365724
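Lasso drives the coefficients of uninformative features to exactly zero, so the selected features can be read off the refit best estimator; a minimal sketch, assuming X_train is a DataFrame:
best_lasso = lasso_model.best_estimator_ # refit by GridSearchCV on the full training set
selected_features = X_train.columns[best_lasso.coef_ != 0]
print(selected_features)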
@Ayushijain09
Ayushijain09 / RFE_Regression.py
Last active July 16, 2020 02:57
Regression with RFE
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression
lm = LinearRegression()
rfe1 = RFE(lm, n_features_to_select=20) # RFE keeping the 20 best features
# Fit on the training data, then reduce both sets to the selected features
X_train_new = rfe1.fit_transform(X_train, y_train)
X_test_new = rfe1.transform(X_test)
# Print the boolean results
print(rfe1.support_) # Output [False False False False True False False False True False False...]
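The boolean mask maps back to column names, and ranking_ records the elimination order; a minimal sketch, assuming X_train is a DataFrame:
print(X_train.columns[rfe1.support_]) # names of the 20 selected features
print(rfe1.ranking_) # rank 1 marks a kept feature; higher ranks were eliminated earlier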
@Ayushijain09
Ayushijain09 / Sequential_Forward_Selection.py
Created July 15, 2020 14:08
Sequential Forward Selection
from mlxtend.feature_selection import SequentialFeatureSelector
from sklearn.linear_model import LinearRegression
# floating=False gives plain SFS; floating selection is an extension not used here
sfs = SequentialFeatureSelector(LinearRegression(),
                                k_features=10,
                                forward=True,
                                floating=False,
                                scoring='r2', # regression metric; 'accuracy' only applies to classifiers
                                cv=2) # k-fold cross-validation
sfs = sfs.fit(X_train, y_train)
selected_features = X_train.columns[list(sfs.k_feature_idx_)]
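The fitted selector also reports the cross-validated score of the chosen subset and can reduce the data directly; a minimal sketch:
print(sfs.k_score_) # cross-validated score of the selected 10-feature subset
X_train_new = sfs.transform(X_train)
X_test_new = sfs.transform(X_test)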
@Ayushijain09
Ayushijain09 / ANOVA.py
Last active July 15, 2020 12:08
Anova feature selection
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_regression
fvalue_selector = SelectKBest(f_regression, k=20) # select the 20 features with the best ANOVA F-values
X_train_new = fvalue_selector.fit_transform(X_train, y_train)
print(X_train.shape, X_train_new.shape) #output (143, 59) (143, 20)
@Ayushijain09
Ayushijain09 / mutual_info_regression.py
Last active July 15, 2020 11:59
Mutual Information for Regression
from sklearn.feature_selection import mutual_info_regression
from sklearn.feature_selection import SelectKBest
selector = SelectKBest(mutual_info_regression, k=10)
X_train_new = selector.fit_transform(X_train, y_train) #Applying transformation to the training set
#to get names of the selected features
mask = selector.get_support() # Output array([False, False, True, True, True, False ....])
print(selector.scores_) #Output array([0.16978127, 0.01829886, 0.45461366, 0.55126343, 0.66081217, 0.27715287 ....])
new_features = X_train.columns[mask]
Missing Value Percentage Check
print((df_train.isnull().sum()/len(df_train)*100).nlargest())
# output => the 5 largest values from the series; no missing values in the automobile dataset, so all show 0%.
"""symboling 0.0
doornumber 0.0
wheelbase 0.0
carlength 0.0
carwidth 0.0
dtype: float64"""
Chi-Square for Feature Selection
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
X_train = X_train.astype(int) # chi2 requires non-negative (count-like) feature values
chi2_features = SelectKBest(chi2, k=12)
X_kbest_features = chi2_features.fit_transform(X_train, y_train)
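The fitted selector exposes the per-feature chi-squared scores and a support mask; a minimal sketch, assuming X_train is a DataFrame:
print(chi2_features.scores_) # chi-squared statistic per feature; higher means stronger dependence on the target
print(X_train.columns[chi2_features.get_support()]) # the 12 retained columns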
Variance Threshold for Feature Selection
from sklearn.feature_selection import VarianceThreshold
print(df_train.shape) # output (143, 59)
var_filter = VarianceThreshold(threshold=0.0) # drop features with zero variance (constants)
train = var_filter.fit_transform(df_train)
# to get the count of features that are not constant
print(train.shape) # output (143, 56)
# or
print(len(df_train.columns[var_filter.get_support()])) # output 56
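A small follow-up sketch, assuming df_train is a DataFrame, listing which constant columns the filter removed:
dropped = df_train.columns[~var_filter.get_support()]
print(dropped) # the 3 zero-variance columns dropped above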