Ayushi Jain (Ayushijain09)
#USING SCIPY
from scipy.stats import spearmanr
from scipy.stats import pearsonr
from scipy.stats import kendalltau
coef, p = pearsonr(x, y)  # Pearson's r
coef, p = spearmanr(x, y) # Spearman's rho
coef, p = kendalltau(x, y) # Kendall's tau
#USING PANDAS
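# (a minimal sketch, assumed rather than taken from the original gist: pandas
# computes the same coefficients directly on a DataFrame via DataFrame.corr)
corr_matrix = df_train.corr(method='pearson')  # also accepts 'spearman' or 'kendall'
print(corr_matrix['price'].nlargest(10))  # 'price' is a hypothetical target column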
#VARIANCE THRESHOLD
from sklearn.feature_selection import VarianceThreshold
print(df_train.shape) #output (143, 59)
var_filter = VarianceThreshold(threshold=0.0)  # removes constant (zero-variance) features
train = var_filter.fit_transform(df_train)
#to get the count of features that are not constant
print(train.shape)  # output (143, 56)
#or
print(len(df_train.columns[var_filter.get_support()])) #output 56
#CHI-SQUARE TEST
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
X_train = X_train.astype(int)  # chi2 requires non-negative feature values
chi2_features = SelectKBest(chi2, k=12)  # keep the 12 best-scoring features
X_kbest_features = chi2_features.fit_transform(X_train, y_train)
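# A possible follow-up (assumed): map the boolean support mask back to column
# names, assuming X_train is a pandas DataFrame
print(X_train.columns[chi2_features.get_support()])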
#MISSING VALUE RATIO
print((df_train.isnull().sum()/len(df_train)*100).nlargest())
#output => returns the 5 largest values from the series. No missing values in the automobile dataset, so all show 0%.
"""symboling 0.0
doornumber 0.0
wheelbase 0.0
carlength 0.0
carwidth 0.0
dtype: float64"""
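# A possible follow-up (assumed, not in the original gist): drop columns whose
# missing-value ratio exceeds a chosen threshold, e.g. 30%
missing_ratio = df_train.isnull().sum()/len(df_train)*100
df_train = df_train.drop(columns=missing_ratio[missing_ratio > 30].index)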
Ayushijain09 / mutual_info_regression.py
Mutual Information for Regression
from sklearn.feature_selection import mutual_info_regression
from sklearn.feature_selection import SelectKBest
selector = SelectKBest(mutual_info_regression, k=10)
X_train_new = selector.fit_transform(X_train, y_train) #Applying transformation to the training set
#to get names of the selected features
mask = selector.get_support() # Output array([False, False, True, True, True, False ....])
print(selector.scores_) #Output array([0.16978127, 0.01829886, 0.45461366, 0.55126343, 0.66081217, 0.27715287 ....])
new_features = X_train.columns[mask]
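# A possible follow-up (assumed): reuse the fitted selector on held-out data
X_test_new = selector.transform(X_test)  # assumes X_test has the same columns as X_train
print(new_features)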
Ayushijain09 / ANOVA.py
ANOVA feature selection
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_regression
fvalue_selector = SelectKBest(f_regression, k=20)  # select the 20 features with the best ANOVA F-values
X_train_new = fvalue_selector.fit_transform(X_train, y_train)
print(X_train.shape, X_train_new.shape) #output (143, 59) (143, 20)
Ayushijain09 / Sequential_Forward_Selection.py
Sequential Forward Selection
from mlxtend.feature_selection import SequentialFeatureSelector
from sklearn.linear_model import LinearRegression
sfs = SequentialFeatureSelector(LinearRegression(),
                                k_features=10,
                                forward=True,    # forward selection; floating SFS is a variant not used here
                                floating=False,
                                scoring='r2',    # regression metric ('accuracy' only applies to classifiers)
                                cv=2)            # cv = k-fold cross-validation
sfs = sfs.fit(X_train, y_train)
selected_features = X_train.columns[list(sfs.k_feature_idx_)]
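# A possible follow-up (assumed): inspect the selected subset and its mean
# cross-validated score
print(selected_features)
print(sfs.k_score_)  # mean CV r2 of the chosen 10-feature subset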
Ayushijain09 / RFE_Regression.py
Regression with RFE
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression
lm = LinearRegression()
rfe1 = RFE(lm, n_features_to_select=20)  # RFE keeping 20 features
# Fit on train data, then reduce train and test to those 20 features
X_train_new = rfe1.fit_transform(X_train, y_train)
X_test_new = rfe1.transform(X_test)
# Print the boolean support mask
print(rfe1.support_)  # Output [False False False False True False False False True False False...]
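# A possible follow-up (assumed): recover the selected column names and the
# elimination ranking (1 = selected), assuming X_train is a DataFrame
print(X_train.columns[rfe1.support_])
print(rfe1.ranking_)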