Skip to content

Instantly share code, notes, and snippets.

View SamarDeen's full-sized avatar

Samar Deen SamarDeen

View GitHub Profile
@SamarDeen
SamarDeen / utilities.py
Created September 5, 2023 15:24
The utilities function
import pandas as pd
import numpy as np
import sklearn
from sklearn.preprocessing import OneHotEncoder
#user defined one hot encode function
def one_hot_encode(df, categorical_columns):
# create an encoder object
encoder = OneHotEncoder(handle_unknown='ignore')
# one-hot encode each categorical column
@SamarDeen
SamarDeen / simple_main_function.py
Created September 5, 2023 13:55
The main function (simple)
import argparse
import pandas as pd
import os
def Main():
print("In the main function")
parser = argparse.ArgumentParser()
parser.add_argument("--input_folder")
parser.add_argument("--input_file_name")
parser.add_argument("--output_folder")
@SamarDeen
SamarDeen / main_function.py
Created September 5, 2023 13:39
The main function
import pandas as pd
import utilities
import argparse
import os
def Main():
print("In the main function")
parser = argparse.ArgumentParser()
parser.add_argument("--input_folder")
parser.add_argument("--input_file_name")
from sklearn.model_selection import train_test_split
# Load the raw data from default.csv.
default = pd.read_csv("default.csv")
# Turn the two Yes/No text columns into 1/0 indicator columns
# (anything other than the exact string 'Yes' becomes 0).
defaulted_mask = default['default'] == 'Yes'
student_mask = default['student'] == 'Yes'
default['TARGET'] = np.where(defaulted_mask, 1, 0)
default['Is_student'] = np.where(student_mask, 1, 0)
# Model inputs: the student indicator plus the balance and income columns.
feature_columns = ['Is_student', 'balance', 'income']
features = default[feature_columns]
# Model output: the 1/0 default indicator built above.
target = default['TARGET']
# Hard-coded confusion matrix for the training data.
# Row 0 holds (TN, FP) and row 1 holds (FN, TP).
conf_train = np.array([[68, 14], [3, 35]])
(TN, FP), (FN, TP) = conf_train
# False positive rate (Type I error rate, 1 - specificity):
# FP / (FP + TN), i.e. the share of negatives wrongly flagged as positive.
print('False Positive Rate: ', (FP / (FP + TN)).round(3))
#1 - Type 2 error rate (power, sensitivity, recall) = share of actual positives correctly identified
from sklearn.metrics import confusion_matrix
# Read the iris data from iris_csv.csv.
iris = pd.read_csv("iris_csv.csv")
# Binary target: 1 for rows whose class is 'Iris-setosa', 0 for every other row.
is_setosa = iris['class'] == 'Iris-setosa'
iris['target'] = np.where(is_setosa, 1, 0)
# Training set: a random 80% of the rows, drawn without replacement.
# No random_state is given, so the selection differs from run to run.
train = iris.sample(frac=0.8, replace=False)
import statsmodels.stats.proportion
from statsmodels.stats.proportion import proportions_ztest
#build counter: a constant 1 per row, so summing it within a group gives the group's row count
df_test['Count'] = 1
df_train['Count'] = 1
#Get proportion for test data: for each Person_drift value (missing values are kept
#as their own group via dropna=False), total the row count and the Target_drift column.
#The string alias 'sum' is used instead of np.sum: recent pandas versions emit a
#FutureWarning when a NumPy reduction is passed to groupby().agg, and 'sum' gives the same result.
df_test_Person = df_test.groupby(['Person_drift'], dropna=False, as_index=False).agg({'Count': 'sum', 'Target_drift': 'sum'})
#Ratio = summed Target_drift divided by the number of rows in the group, rounded to 3 decimals
df_test_Person['Ratio'] = (df_test_Person['Target_drift'] / df_test_Person['Count']).round(3)
from scipy.stats import ks_2samp
# Cast the BP_* and Target_* columns to float in both frames with one shared mapping.
float_columns = {
    'BP_drift': 'float',
    'BP_no_drift': 'float',
    'Target_drift': 'float',
    'Target_no_drift': 'float',
}
df_train = df_train.astype(float_columns)
df_test = df_test.astype(float_columns)
# Two-sample Kolmogorov-Smirnov test comparing the training BP_no_drift column
# with the test BP_drift column. The result is not stored.
ks_2samp(df_train['BP_no_drift'], df_test['BP_drift'])
import numpy as np
import pandas as pd
# Fix the global NumPy seed so the synthetic data is reproducible.
np.random.seed(42)
# Draw the three column groups in this exact order (the order determines the
# random values): two 0/1 columns with P(1) = 0.15, two integer columns in
# [110, 150), and two label columns.
flag_columns = np.random.choice(a=[0, 1], p=[0.85, 0.15], size=(1000, 2))
reading_columns = np.random.randint(110, 150, size=(1000, 2))
label_columns = np.random.choice(a=['artist', 'cyclist', 'batsman'], p=[0.4, 0.1, 0.5], size=(1000, 2))
# Join the groups side by side into a single 1000 x 6 array.
train_data = np.concatenate([flag_columns, reading_columns, label_columns], axis=1)
test_data = np.concatenate([
np.random.choice(a=[0, 1], p=[0.85, 0.15], size=(1000, 1)),