This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Import the required library.
from sklearn.feature_selection import VarianceThreshold

# Define the transform.
# The default threshold is 0 and can be set to any float value. All features
# with variance below this threshold will be removed.
selector = VarianceThreshold(threshold=0)

# Fit the transform on the numerical columns of the training dataset.
# The "number" dtype alias selects the same columns as np.number, so this
# snippet does not depend on numpy having been imported.
selector.fit(X_train.select_dtypes(include="number"))
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Import numpy (needed for np.abs below) and the Z-score function.
import numpy as np
from scipy.stats import zscore

# Define the SD threshold: absolute Z-scores above it are flagged.
thresh = 3

# One True/False flag per row: True if any column of that row has an absolute
# Z-score above the threshold. nan_policy="omit" is passed to zscore so that
# NaN values are left out of the calculation.
SD_outliers = X_train.apply(
    lambda column: np.abs(zscore(column, nan_policy="omit")) > thresh
).any(axis=1)

# Next step (statement not shown in this excerpt): drop, in place, the rows
# that are True in SD_outliers.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Work on the numerical columns only: quartiles are not defined for other
# dtypes, and pandas 2.x no longer skips such columns by default in quantile().
numeric_cols = X_train.select_dtypes(include="number")

# Calculate the 1st and 3rd quartiles and the IQR of each numerical column.
Q1 = numeric_cols.quantile(0.25)
Q3 = numeric_cols.quantile(0.75)
IQR = Q3 - Q1

# Flag rows with at least one value outside the 1.5 * IQR fences of its column.
below_lower_fence = numeric_cols < (Q1 - 1.5 * IQR)
above_upper_fence = numeric_cols > (Q3 + 1.5 * IQR)
has_outlier = (below_lower_fence | above_upper_fence).any(axis=1)

# Keep every column of X_train, but only the rows without a flagged value.
X_train_new = X_train[~has_outlier]
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Import the required library.
from sklearn.neighbors import LocalOutlierFactor

# Define the LOF estimator (consider playing around with the 'n_neighbors'
# parameter).
lof = LocalOutlierFactor()

# Predict, for each row of the training data, whether it is an outlier or not
# (fit_predict labels outliers as -1 and inliers as 1).
yhat = lof.fit_predict(X_train)

# Next step (statement not shown in this excerpt): select all rows that are
# not outliers.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Import the required library.
from sklearn.feature_selection import VarianceThreshold

# Define the transform.
# The default threshold is 0 and can be set to any float value. All features
# with variance below this threshold will be removed.
selector = VarianceThreshold(threshold=0)

# Fit the defined transform on the training dataset.
selector.fit(X_train)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Import the required library.
from sklearn.impute import SimpleImputer

# Define the imputer. Other strategy options include: median, most_frequent,
# constant.
imputer = SimpleImputer(strategy='mean')

# Fit on the training dataset.
imputer.fit(X_train)

# Next step (statement not shown in this excerpt): transform the training
# dataset (don't forget to transform the test dataset as well).
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Import the required libraries.
from sklearn.ensemble import RandomForestClassifier  # can be any classifier of your choice
from sklearn.impute import SimpleImputer
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.pipeline import Pipeline

# Define the components of the modeling pipeline. The statements that
# assemble and evaluate the pipeline are not shown in this excerpt.
model = RandomForestClassifier()  # can be any model that you want to use
imputer = SimpleImputer(strategy='mean')  # other allowed imputation strategies can also be used
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Import the required libraries.
from sklearn.ensemble import RandomForestClassifier  # can be any classifier of your choice
from sklearn.impute import SimpleImputer
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.pipeline import Pipeline

# Define a list of all imputation strategies to be evaluated. The evaluation
# statements themselves are not shown in this excerpt.
strategies = ['mean', 'median', 'most_frequent', 'constant']
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Import the required library.
from sklearn.impute import KNNImputer

# Define the imputer. You might want to try different n_neighbors values;
# the default is 5.
imputer = KNNImputer()

# Fit on the training dataset.
imputer.fit(X_train)

# Next step (statement not shown in this excerpt): transform the training
# dataset (don't forget to transform the test dataset as well).
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Import the required libraries.
from sklearn.ensemble import RandomForestClassifier  # can be any classifier of your choice
from sklearn.impute import KNNImputer
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.pipeline import Pipeline

# Define the components of the modeling pipeline. The statements that
# assemble and evaluate the pipeline are not shown in this excerpt.
model = RandomForestClassifier()  # can be any model that you want to use
imputer = KNNImputer()
OlderNewer