Skip to content

Instantly share code, notes, and snippets.

@devarshi16
Created July 20, 2021 16:07
Show Gist options
  • Save devarshi16/cd09e245ffaf64eaf780ab346b2d0599 to your computer and use it in GitHub Desktop.
import os
import numpy as np
import time
import matplotlib.pyplot as plt
from tqdm import tqdm
import requests
import gzip
from numpy import loadtxt,savetxt
# Suppress divide/invalid warnings (0/0 can occur during min-max scaling).
np.seterr(divide='ignore', invalid='ignore')
###################----DOWNLOADING THE PIMA-INDIAN-DIABETES-DATASET-----###################
url = 'https://raw.githubusercontent.com/npradaschnor/Pima-Indians-Diabetes-Dataset/master/diabetes.csv'
target = 'diabetes.csv'
response = requests.get(url, stream=True)
if response.status_code == 200:
    # NOTE(review): response.raw.read() yields the transfer bytes exactly as the
    # server sent them (possibly gzip-encoded); pre_data_prep() below appears to
    # rely on that by reading the file through gzip — confirm before changing.
    with open(target, 'wb') as f:
        f.write(response.raw.read())
else:
    print("status code:", response.status_code)
    print("Unable to download:", url)
##########################--------HELPER FUNCTIONS---------###########################
def scalify_min_max(np_dataframe):
    """Min-max scale each column of a 2-D array into [0, 1].

    BUG FIX: a constant column has range 0 and the original produced NaN
    from the 0/0 division (silenced by np.seterr); such columns now map to 0.
    """
    minimum_array = np.amin(np_dataframe, axis=0)
    maximum_array = np.amax(np_dataframe, axis=0)
    range_array = maximum_array - minimum_array
    # Replace a zero range with 1 so (x - min) / 1 == 0 for constant columns.
    safe_range = np.where(range_array == 0, 1, range_array)
    scaled = (np_dataframe - minimum_array) / safe_range
    return scaled
def accuracy_calculator(Y_out, Y):
    """Return [accuracy, precision, recall, F1] for binary 0/1 predictions.

    Y_out and Y are arrays of 0/1 labels with the same leading dimension.
    Divisions by zero (no predicted/actual positives) yield NaN, consistent
    with the file-level np.seterr configuration.
    """
    accuracy = np.sum(np.logical_not(np.logical_xor(Y_out, Y))) / Y.shape[0]
    true_positives = np.sum(np.logical_and(Y_out, Y))
    false_positives = np.sum(np.logical_and(Y_out, np.logical_not(Y)))
    false_negatives = np.sum(np.logical_and(np.logical_not(Y_out), Y))
    precision = true_positives / (true_positives + false_positives)
    recall = true_positives / (true_positives + false_negatives)
    # BUG FIX: F1 is the harmonic mean 2*P*R/(P+R); the original omitted the
    # factor of 2 and therefore reported half of the true F1 score.
    F1_score = 2 * precision * recall / (precision + recall)
    return [accuracy, precision, recall, F1_score]
####################-----DATA PREP FUNCITONS------###############################
def pre_data_prep(filename, dest_fileloc):
    """Read the raw diabetes CSV, min-max scale the feature columns, and
    write the result (scaled features + untouched label column) to
    dest_fileloc.

    BUG FIX: the original unconditionally read the file through gzip, which
    crashes when the download is a plain-text CSV (whether the saved bytes
    are gzip-encoded depends on the server's transfer encoding). A plain
    CSV is now handled as a fallback.
    """
    try:
        with open(filename, 'rb') as f:
            gzip_fd = gzip.GzipFile(fileobj=f)
            next(gzip_fd)  # skip the CSV header row
            diabetes_df = loadtxt(gzip_fd, delimiter=',', dtype=np.float32)
    except OSError:
        # Not gzip-compressed: read as plain text, skipping the header row.
        diabetes_df = loadtxt(filename, delimiter=',', dtype=np.float32, skiprows=1)
    Y = diabetes_df[:, -1]  # last column is the 0/1 outcome label
    scaled_diabetes_df = scalify_min_max(diabetes_df[:, :-1])
    concat_diabetes = np.concatenate((scaled_diabetes_df, np.array([Y]).T), axis=1)
    savetxt(dest_fileloc, concat_diabetes, delimiter=',')
def dataprep(fileloc, split):
    """Load the scaled CSV and split it into train/val/test partitions.

    split is a 3-element sequence of fractions (train, val, test) summing
    to 1. Returns X_train, X_val, X_test, Y_train, Y_val, Y_test, with Y
    arrays shaped (m, 1).
    """
    assert len(split) == 3
    # BUG FIX: the original required sum(split) == 1 exactly, which can fail
    # for valid ratios like [0.7, 0.15, 0.15] due to binary float rounding;
    # compare with a tolerance instead.
    assert abs(sum(split) - 1) < 1e-9
    diabetes_data = loadtxt(fileloc, delimiter=',', dtype=np.float32)
    Y = np.array([diabetes_data[:, -1]]).T
    classes = np.unique(Y)
    assert len(classes) == 2  # binary classification expected
    X = diabetes_data[:, :-1]
    data_size = X.shape[0]
    print(data_size, X.shape, Y.shape)
    split_size = int(split[0] * data_size)
    val_split = int(split[1] * data_size)
    X_train = X[:split_size]
    X_val = X[split_size:split_size + val_split]
    X_test = X[split_size + val_split:]
    Y_train = Y[:split_size]
    Y_val = Y[split_size:split_size + val_split]
    Y_test = Y[split_size + val_split:]
    return X_train, X_val, X_test, Y_train, Y_val, Y_test
# Evaluating the learned model
def evaluate(theta_params, X, Y=None, thresh=0.5):
    """Score a linear model on X: return (binary predictions, MSE cost).

    theta_params is an (n+1, 1) column vector whose first entry multiplies
    the prepended bias column. BUG FIX: the original declared Y=None but
    computed the cost unconditionally, raising TypeError when Y was omitted;
    cost is now None in that case.
    """
    data_size = X.shape[0]
    # Prepend a column of ones for the bias term.
    X_extend = np.concatenate((np.ones((data_size, 1)), X), axis=1)
    scores = np.matmul(X_extend, theta_params)
    pred = np.greater(scores, thresh) * 1
    cost = None
    if Y is not None:
        cost = np.sum(np.square(scores - Y)) / (data_size * 2)
    return pred, cost
###############--------REGRESSION FUNCTION----------############################
def linear_regression(X, Y, learning_rate=0.001, num_iters=100, thresh=0.5, rand_seed=None):
    """Train a linear model on (X, Y) with batch gradient descent.

    Y holds 0/1 labels as an (m, 1) column; final predictions threshold the
    raw linear score at `thresh`. Returns (theta_params, training accuracy,
    per-iteration cost array).
    """
    if rand_seed is not None:  # idiom fix: compare to None with `is`
        np.random.seed(rand_seed)
    data_size = X.shape[0]
    # (n+1, 1) column of weights; the extra weight is the bias term.
    theta_params = np.array([np.random.randn(X.shape[1] + 1)]).T
    X_extend = np.concatenate((np.ones((data_size, 1)), X), axis=1)
    cost = []
    for i in tqdm(range(num_iters), desc="Training.."):
        gradient = np.matmul((np.matmul(theta_params.T, X_extend.T) - Y.T), X_extend).T / data_size
        theta_params = theta_params - learning_rate * gradient
        # BUG FIX: the original indexed the squared-error array with [0], so
        # the recorded cost covered only the first sample instead of all m.
        cost.append(np.sum(np.square(np.matmul(X_extend, theta_params) - Y)) / (data_size * 2))
    final_pred = np.greater(np.matmul(X_extend, theta_params), thresh) * 1
    accuracy = np.sum(np.logical_not(np.logical_xor(final_pred, Y))) / data_size
    cost = np.array(cost)
    return theta_params, accuracy, cost
###############--------LOGISTIC FUNCTION------------############################
def sigmoid_func(theta, X):
    """Element-wise logistic sigmoid of the linear scores theta.T @ X."""
    z = np.matmul(theta.T, X)
    return 1 / (1 + np.exp(-z))
def logistic_regression(X, Y, learning_rate=0.001, num_iters=100, thresh=0.5, rand_seed=None):
    """Train a logistic-regression classifier with batch gradient descent.

    Y holds 0/1 labels as an (m, 1) column. Returns (theta_params, training
    accuracy, per-iteration binary cross-entropy cost array).
    """
    if rand_seed is not None:  # idiom fix: compare to None with `is`
        np.random.seed(rand_seed)
    data_size = X.shape[0]
    theta_params = np.array([np.random.randn(X.shape[1] + 1)]).T  # (n+1, 1)
    # Note: unlike linear_regression, X_extend is stored transposed, (n+1, m).
    X_extend = np.concatenate((np.ones((data_size, 1)), X), axis=1).T
    cost = []
    for i in tqdm(range(num_iters), desc="Training.."):
        h_theta = sigmoid_func(theta_params, X_extend).T  # (m, 1)
        grad = np.matmul(X_extend, (h_theta - Y)) / data_size  # (n+1, m) @ (m, 1)
        theta_params = theta_params - learning_rate * grad
        # Binary cross-entropy; file-level np.seterr suppresses log(0) warnings.
        cost.append(-1 * np.sum(Y * np.log(h_theta) + (1 - Y) * np.log(1 - h_theta)) / (data_size))
    # BUG FIX: the original thresholded the RAW linear score at `thresh` (0.5),
    # but for logistic regression the sigmoid output is what must be compared
    # to the threshold (sigmoid(z) > 0.5 corresponds to z > 0, not z > 0.5).
    final_pred = np.greater(sigmoid_func(theta_params, X_extend).T, thresh) * 1
    accuracy = np.sum(np.logical_not(np.logical_xor(final_pred, Y))) / data_size
    cost = np.array(cost)
    return theta_params, accuracy, cost
###############--------REGRESSION RUNNER---------###############################
def regression_runner(fileloc, data_split_ratios, seed_values):
    """Train one linear-regression model per seed, plot the training cost
    curves, pick the best model by validation accuracy, and report test
    metrics.
    """
    X_train, X_val, X_test, Y_train, Y_val, Y_test = dataprep(fileloc, data_split_ratios)
    all_models = []
    all_val_accuracies = []
    num_iters = 500
    x_axis = np.arange(num_iters)
    # Idiom fix: enumerate the seeds instead of indexing via range(len(...)).
    for i, seed in enumerate(seed_values):
        model, train_accuracy, cost = linear_regression(X_train, Y_train, rand_seed=seed, num_iters=num_iters)
        print("Trial:", i, ".Train Accuracy:", train_accuracy)
        all_models.append(model)
        plt.plot(x_axis, cost, label=str(seed))
        val_prediction, val_cost = evaluate(model, X_val, Y_val)
        accuracy_precision = accuracy_calculator(val_prediction, Y_val)
        all_val_accuracies.append(accuracy_precision[0])
        print("Validation Accuracy:", accuracy_precision, "Validation Cost:", val_cost)
    plt.title("Linear Regression")
    plt.xlabel('Number of iterations')
    plt.ylabel('Cost')
    plt.show()
    # np.argmax returns the first index of the maximum — same behavior as the
    # original np.where(... == amax)[0][0], but simpler.
    max_accuracy_idx = int(np.argmax(all_val_accuracies))
    best_model = all_models[max_accuracy_idx]
    test_pred, test_cost = evaluate(best_model, X_test, Y_test)
    test_accuracy, test_precision, test_recall, test_f1 = accuracy_calculator(test_pred, Y_test)
    print("Test accuracy:", test_accuracy, ".Test cost:", test_cost)
#####################-------------LOGISTIC RUNNER--------------##########################
def logistic_runner(fileloc, data_split_ratios, seed_values):
    """Train one logistic-regression model per seed, plot the training cost
    curves, pick the best model by validation accuracy, and report test
    metrics.
    """
    X_train, X_val, X_test, Y_train, Y_val, Y_test = dataprep(fileloc, data_split_ratios)
    all_models = []
    all_val_accuracies = []
    num_iters = 1500
    x_axis = np.arange(num_iters)
    # BUG FIX: the original looped `for i in range(10)` regardless of how many
    # seeds were supplied, raising IndexError for fewer than 10; iterate the
    # seed list itself.
    for i, seed in enumerate(seed_values):
        model, train_accuracy, cost = logistic_regression(X_train, Y_train, rand_seed=seed, num_iters=num_iters)
        print("Trial:", i, ".Train Accuracy:", train_accuracy)
        all_models.append(model)
        plt.plot(x_axis, cost, label=str(seed))
        val_prediction, val_cost = evaluate(model, X_val, Y_val)
        accuracy_precision = accuracy_calculator(val_prediction, Y_val)
        all_val_accuracies.append(accuracy_precision[0])
        print("Validation Accuracy:", accuracy_precision, "Validation Cost:", val_cost)
    plt.title("Logistic Regression")
    plt.xlabel('Number of iterations')
    plt.ylabel('Cost')
    plt.show()
    # First index of the best validation accuracy (ties → earliest model).
    max_accuracy_idx = int(np.argmax(all_val_accuracies))
    best_model = all_models[max_accuracy_idx]
    test_pred, test_cost = evaluate(best_model, X_test, Y_test)
    test_accuracy, test_precision, test_recall, test_f1 = accuracy_calculator(test_pred, Y_test)
    print("Test accuracy:", test_accuracy, ".Test cost:", test_cost)
if __name__ == "__main__":
    # Scale the raw download once, then run both model families on the same
    # train/val/test split ratios with their own seed lists.
    fileloc = 'diabetes_scaled.csv'
    pre_data_prep('diabetes.csv', fileloc)
    data_split_ratios = [0.7, 0.15, 0.15]
    seed_values = [12345, 65432, 872485, 13500, 198613, 426713, 923451, 155978, 18289, 1050]
    regression_runner(fileloc, data_split_ratios, seed_values)
    seed_values = [12345, 8123, 872485, 1350, 198613, 426713, 923451, 155978, 18289, 167]
    logistic_runner(fileloc, data_split_ratios, seed_values)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment