This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pandas as pd | |
import numpy as np | |
import sklearn | |
from sklearn.preprocessing import OneHotEncoder | |
#user defined one hot encode function | |
def one_hot_encode(df, categorical_columns): | |
# create an encoder object | |
encoder = OneHotEncoder(handle_unknown='ignore') | |
# one-hot encode each categorical column |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import argparse | |
import pandas as pd | |
import os | |
def Main(): | |
print("In the main function") | |
parser = argparse.ArgumentParser() | |
parser.add_argument("--input_folder") | |
parser.add_argument("--input_file_name") | |
parser.add_argument("--output_folder") |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pandas as pd | |
import utilities | |
import argparse | |
import os | |
def Main(): | |
print("In the main function") | |
parser = argparse.ArgumentParser() | |
parser.add_argument("--input_folder") | |
parser.add_argument("--input_file_name") |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from sklearn.model_selection import train_test_split | |
# Load the credit-default data; "default.csv" is resolved relative to the
# current working directory.
default = pd.read_csv("default.csv")

# Encode the two Yes/No columns as 0/1 integer indicators.
default['TARGET'] = np.where(default['default'] == 'Yes', 1, 0)
default['Is_student'] = np.where(default['student'] == 'Yes', 1, 0)

# Subset features: the student indicator plus the 'balance' and 'income' columns.
features = default[['Is_student', 'balance', 'income']]

# Set target: the 0/1 default indicator built above.
target = default['TARGET']
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Initialize the confusion matrix, laid out as [[TN, FP], [FN, TP]]:
# rows index the actual class, columns the predicted class.
conf_train = np.array([[68, 14], [3, 35]])
TN, FP = conf_train[0]
FN, TP = conf_train[1]

# Type 1 error rate (1 - specificity): share of actual negatives that were
# incorrectly identified as positive.
print('False Positive Rate: ', (FP / (TN + FP)).round(3))
#1-Type 2 error, power, sensitivity, recall = percentage of true label correctly identified |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from sklearn.metrics import confusion_matrix | |
# Load the iris dataset; "iris_csv.csv" is resolved relative to the current
# working directory.
iris = pd.read_csv("iris_csv.csv")

# Create the target for a binary classification problem:
# 1 where the class is 'Iris-setosa', 0 for every other class.
iris['target'] = np.where(iris['class'] == 'Iris-setosa', 1, 0)

# Split train/test data: draw 80% of the rows, without replacement, for training.
# NOTE(review): no random_state is passed, so the sampled rows differ per run.
train = iris.sample(frac=0.8, replace=False)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import statsmodels.stats.proportion | |
from statsmodels.stats.proportion import proportions_ztest | |
# Build a counter column: summing it per group yields the group's row count.
df_test['Count'] = 1
df_train['Count'] = 1

# Get proportion for test data: for each Person_drift value (missing values
# are kept as their own group via dropna=False) total the rows and the
# Target_drift values, then divide one by the other.
# The aggregations are named by string ('sum') rather than passing np.sum,
# which pandas treats as a deprecated alias for its own group-wise sum.
df_test_Person = df_test.groupby(
    ['Person_drift'], dropna=False, as_index=False
).agg({'Count': 'sum', 'Target_drift': 'sum'})
df_test_Person['Ratio'] = (
    df_test_Person['Target_drift'] / df_test_Person['Count']
).round(3)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from scipy.stats import ks_2samp | |
# Cast every column used by the drift checks to float, one astype pass per
# frame instead of two.
_float_columns = {
    'BP_drift': 'float',
    'BP_no_drift': 'float',
    'Target_drift': 'float',
    'Target_no_drift': 'float',
}
df_train = df_train.astype(_float_columns)
df_test = df_test.astype(_float_columns)

# Perform the two-sample Kolmogorov-Smirnov test between the training
# 'BP_no_drift' column and the test 'BP_drift' column.
# NOTE(review): the result is a bare expression, so it is only displayed when
# this is the last statement of a notebook cell; bind it to a name if the
# statistic or p-value is needed later.
ks_2samp(df_train['BP_no_drift'], df_test['BP_drift'])
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import numpy as np | |
import pandas as pd | |
# Seed NumPy's global generator so the synthetic data is reproducible.
np.random.seed(42)

# Synthetic training set, 1000 rows x 6 columns, built left to right:
#   columns 0-1: 0/1 flags drawn with P(1) = 0.15
#   columns 2-3: integers drawn from [110, 150)
#   columns 4-5: labels drawn from 'artist' / 'cyclist' / 'batsman'
# The numeric pieces are cast to str explicitly so the combined array has a
# single string dtype by construction, rather than through implicit
# number-to-string promotion inside np.concatenate. The draws happen in the
# same order as before, so the seeded values are unchanged.
train_data = np.concatenate(
    [
        np.random.choice(a=[0, 1], p=[0.85, 0.15], size=(1000, 2)).astype(str),
        np.random.randint(110, 150, size=(1000, 2)).astype(str),
        np.random.choice(
            a=['artist', 'cyclist', 'batsman'], p=[0.4, 0.1, 0.5], size=(1000, 2)
        ),
    ],
    axis=1,
)
test_data = np.concatenate([ | |
np.random.choice(a=[0, 1], p=[0.85, 0.15], size=(1000, 1)), |