Asad Mumtaz finlytics-hub

## variancethreshold_string.py
# import the required libraries
import numpy as np  # provides the np.number dtype selector used below
from sklearn.feature_selection import VarianceThreshold

# Define the transform.
# The default threshold is 0 and can be set to any float value. All features
# with variance below this threshold will be removed.
selector = VarianceThreshold(threshold=0)

# Fit the defined transform on the training dataset's numerical columns only;
# select_dtypes leaves out every non-numeric (e.g. string) column.
selector.fit(X_train.select_dtypes(include=[np.number]))

## Zscore_outliers.py
# Import NumPy (for np.abs) and the Z-score function
import numpy as np
from scipy.stats import zscore

# Define the SD threshold
thresh = 3

# One boolean per row: True when at least one column of that row has an
# absolute Z-score above the threshold. Z-scores are computed per column and
# NaNs are left out of the computation (nan_policy='omit').
SD_outliers = X_train.apply(lambda x: np.abs(zscore(x, nan_policy='omit')) > thresh).any(axis=1)

# Drop (inplace) rows that have True in SD_outliers

## IQR_outliers.py
# Per-column 1st and 3rd quartiles; the IQR is the distance between them
Q1, Q3 = X_train.quantile(0.25), X_train.quantile(0.75)
IQR = Q3 - Q1

# Keep only the rows in which no value lies more than 1.5 * IQR below Q1
# or more than 1.5 * IQR above Q3
X_train_new = X_train.loc[
    ~((X_train < Q1 - 1.5 * IQR) | (X_train > Q3 + 1.5 * IQR)).any(axis=1)
]

## LOF_outliers.py
# Import the required library
from sklearn.neighbors import LocalOutlierFactor

# create the Local Outlier Factor model with its default parameters
lof = LocalOutlierFactor() # consider playing around with 'n_neighbors' parameter

# fit on the training data and label each row (sample), not each column;
# per scikit-learn's documentation fit_predict returns -1 for outliers and 1 for inliers
yhat = lof.fit_predict(X_train)

# select all rows that are not outliers

## variancethreshold.py
# import the required library
from sklearn.feature_selection import VarianceThreshold

# Build the transform and fit it on the training dataset in one statement
# (fit() hands back the fitted selector itself).
# threshold defaults to 0 and accepts any float value; every feature whose
# variance is below it will be removed.
selector = VarianceThreshold(threshold=0).fit(X_train)

## simpleimputer_basic.py
# import the required library
from sklearn.impute import SimpleImputer

# Create the imputer and fit it on the training dataset in one statement.
# Besides 'mean', the strategy can be: median, most_frequent, constant
imputer = SimpleImputer(strategy='mean').fit(X_train)

# transform the training dataset (don't forget to transform the test dataset as well)

## simpleimputer_pipeline.py
# import required libraries
from sklearn.ensemble import RandomForestClassifier # can be any classifier of your choice
from sklearn.impute import SimpleImputer
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.pipeline import Pipeline

# building blocks of the modeling pipeline: the imputation step and the estimator
imputer = SimpleImputer(strategy='mean')  # any other allowed imputation strategy works too
model = RandomForestClassifier()  # swap in whichever model you want to use

## simpleimputer_pipeline_for.py
# import required libraries
from sklearn.ensemble import RandomForestClassifier # can be any classifier of your choice
from sklearn.impute import SimpleImputer
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.pipeline import Pipeline

# every imputation strategy that is going to be evaluated
strategies = [
    'mean',
    'median',
    'most_frequent',
    'constant',
]

## KNNImputer_basic.py
# import the required library
from sklearn.impute import KNNImputer

# Create the imputer and fit it on the training dataset in one statement.
# n_neighbors is left at its default of 5; you might want to try other values.
imputer = KNNImputer().fit(X_train)

# transform the training dataset (don't forget to transform the test dataset as well)

## KNNImputer_pipeline.py
# import required libraries
from sklearn.ensemble import RandomForestClassifier # can be any classifier of your choice
from sklearn.impute import KNNImputer
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.pipeline import Pipeline

# building blocks of the modeling pipeline: the imputation step and the estimator
imputer = KNNImputer()
model = RandomForestClassifier()  # swap in whichever model you want to use
	# import the required libraries
	import numpy as np  # provides the np.number dtype selector used below
	from sklearn.feature_selection import VarianceThreshold

	# Define the transform.
	# The default threshold is 0 and can be set to any float value. All features
	# with variance below this threshold will be removed.
	selector = VarianceThreshold(threshold=0)

	# Fit the defined transform on the training dataset's numerical columns only;
	# select_dtypes leaves out every non-numeric (e.g. string) column.
	selector.fit(X_train.select_dtypes(include=[np.number]))
	# Import NumPy (for np.abs) and the Z-score function
	import numpy as np
	from scipy.stats import zscore

	# Define the SD threshold
	thresh = 3

	# One boolean per row: True when at least one column of that row has an
	# absolute Z-score above the threshold. Z-scores are computed per column and
	# NaNs are left out of the computation (nan_policy='omit').
	SD_outliers = X_train.apply(lambda x: np.abs(zscore(x, nan_policy='omit')) > thresh).any(axis=1)

	# Drop (inplace) rows that have True in SD_outliers
	# Calculate the 1st and 3rd quartiles (25th and 75th percentiles), and the IQR
	Q1 = X_train.quantile(0.25)
	Q3 = X_train.quantile(0.75)
	IQR = Q3 - Q1

	# Filter out the rows that fall outside the 1.5 * IQR threshold in any column.
	# The two per-cell masks are combined with the element-wise OR operator `|`.
	X_train_new = X_train[~((X_train < (Q1 - 1.5 * IQR)) | (X_train > (Q3 + 1.5 * IQR))).any(axis=1)]
	# Import the required library
	from sklearn.neighbors import LocalOutlierFactor

	# create the Local Outlier Factor model with its default parameters
	lof = LocalOutlierFactor() # consider playing around with 'n_neighbors' parameter

	# fit on the training data and label each row (sample), not each column;
	# per scikit-learn's documentation fit_predict returns -1 for outliers and 1 for inliers
	yhat = lof.fit_predict(X_train)

	# select all rows that are not outliers
	# import the required library
	from sklearn.feature_selection import VarianceThreshold

	# Build the transform and fit it on the training dataset in one statement
	# (fit() hands back the fitted selector itself).
	# threshold defaults to 0 and accepts any float value; every feature whose
	# variance is below it will be removed.
	selector = VarianceThreshold(threshold=0).fit(X_train)
	# import the required library
	from sklearn.impute import SimpleImputer

	# Create the imputer and fit it on the training dataset in one statement.
	# Besides 'mean', the strategy can be: median, most_frequent, constant
	imputer = SimpleImputer(strategy='mean').fit(X_train)

	# transform the training dataset (don't forget to transform the test dataset as well)
	# import required libraries
	from sklearn.ensemble import RandomForestClassifier # can be any classifier of your choice
	from sklearn.impute import SimpleImputer
	from sklearn.model_selection import cross_val_score
	from sklearn.model_selection import RepeatedStratifiedKFold
	from sklearn.pipeline import Pipeline

	# building blocks of the modeling pipeline: the imputation step and the estimator
	imputer = SimpleImputer(strategy='mean')  # any other allowed imputation strategy works too
	model = RandomForestClassifier()  # swap in whichever model you want to use
	# import the required library
	from sklearn.impute import KNNImputer

	# Create the imputer and fit it on the training dataset in one statement.
	# n_neighbors is left at its default of 5; you might want to try other values.
	imputer = KNNImputer().fit(X_train)

	# transform the training dataset (don't forget to transform the test dataset as well)