Skip to content

Instantly share code, notes, and snippets.

@duarteocarmo
Created November 14, 2017 08:27
Show Gist options
  • Save duarteocarmo/84b7164f6532f5101f7380e3a5e86982 to your computer and use it in GitHub Desktop.
Save duarteocarmo/84b7164f6532f5101f7380e3a5e86982 to your computer and use it in GitHub Desktop.
Classification - Logistic Regression
# -*- coding: utf-8 -*-
"""
Created on Sun Nov 12 09:54:58 2017
@author: sviglios
"""
from Project_Clean_data import raw, header, is_binary
from sklearn import cross_validation, tree
import sklearn.linear_model as lm
from sklearn.metrics import confusion_matrix
from Project_Clean_data import raw, header, is_binary
from matplotlib.pyplot import figure, plot, subplot, title, xlabel, ylabel, show, clim, ion, legend, boxplot, savefig, imshow, colorbar, xticks, yticks, ylim
import matplotlib.pyplot as plt
import numpy as np
import math
# Load the held-out "chest" rows and the row indices to exclude from the
# cross-validation data; both are read as whitespace-separated integers.
X_chest = np.loadtxt('chest.txt', dtype=int)
final_cand = np.loadtxt('final_cand.txt', dtype=int)

# Column to predict, located by name in the header.
target_attribute_name = 'Dx'
target_index = list(header).index(target_attribute_name)

# Target vector: the chosen column of `raw`, minus the excluded rows.
y = np.delete(raw[:, target_index], final_cand)

# Feature matrix: `raw` without the target column and without the excluded rows.
X = np.delete(np.delete(raw, target_index, axis=1), final_cand, axis=0)

# Attribute names aligned with the remaining columns of X.
attributeNames = np.delete(header, target_index)

N, M = X.shape  # number of rows, number of feature columns
C = 2
# K-fold cross-validation of a logistic-regression classifier.
#
# Each fold fits a fresh model on the training split and records its
# misclassification rate on both the training and the test split.  The fitted
# models are collected in `models` so one of them can be reused after the loop.
K = 10
# NOTE(review): `sklearn.cross_validation` is the legacy API (deprecated in
# scikit-learn 0.18, removed in 0.20).  Its KFold takes the sample count and
# is iterated directly.  It is kept because the file's imports rely on it;
# the modern form is
# `sklearn.model_selection.KFold(n_splits=K, shuffle=True).split(X)`,
# which also needs the top-of-file import changed.
CV = cross_validation.KFold(N, K, shuffle=True)
k = 0
Error_train = np.empty(K)
Error_test = np.empty(K)
err_train = []
err_test = []
models = []
for k, (train_index, test_index) in enumerate(CV, start=1):
    print('Computing CV fold: {0}/{1}..'.format(k, K))

    # Extract training and test set for the current CV fold.
    X_train, y_train = X[train_index, :], y[train_index]
    X_test, y_test = X[test_index, :], y[test_index]

    # Fit the logistic regression model through the public class path
    # (the `lm.logistic` submodule is not part of the public API).
    model = lm.LogisticRegression()
    model = model.fit(X_train, y_train)
    y_est_train = model.predict(X_train)
    y_est_test = model.predict(X_test)

    # Misclassification rate = fraction of predictions that differ from the
    # true label.  Comparing labels (instead of subtracting them) keeps the
    # rate correct regardless of how the two classes are encoded.
    misclass_rate_test = np.mean(y_est_test != y_test)
    misclass_rate_train = np.mean(y_est_train != y_train)

    # Keep the preallocated arrays in sync with the lists; np.empty leaves
    # them holding arbitrary values until they are written.
    Error_test[k - 1] = misclass_rate_test
    Error_train[k - 1] = misclass_rate_train

    models.append(model)
    err_test.append(misclass_rate_test)
    err_train.append(misclass_rate_train)
# Final test on the held-out "chest" rows.
#
# Split the held-out array into its target column and its feature columns.
# NOTE(review): assumes chest.txt has the same column layout as `raw`, so
# that `target_index` points at the same attribute -- confirm.
y_chest = X_chest[:, target_index]
X_chest = np.delete(X_chest, target_index, axis=1)

# Reuse the fold model with the lowest cross-validation test error
# (the first such model when several folds tie).
index = int(np.argmin(err_test))
last_model = models[index]

# Misclassification rate = fraction of held-out predictions that differ from
# the true label; comparing labels keeps this valid for any class encoding.
f_y_est_chest = last_model.predict(X_chest)
f_misclass_rate_chest = np.mean(f_y_est_chest != y_chest)
accuracy = 1 - f_misclass_rate_chest
print(f_misclass_rate_chest)
# Scatter plot of the selected model's predicted probabilities on the
# held-out rows, coloured by the true label.
#
# Column 0 of predict_proba belongs to the first entry of the model's
# `classes_`.  NOTE(review): the axis label treats that as the negative
# diagnosis, i.e. assumes the labels are 0 (negative) and 1 (positive).
f_y_est_chest_neg_prob = last_model.predict_proba(X_chest)[:, 0]

# No `hold` call is needed: successive plot() calls draw onto the same axes
# by default, and Figure.hold no longer exists in matplotlib >= 3.0.
f = figure()

# Rows whose true label is 0, drawn in yellow.
class0_ids = np.nonzero(y_chest == 0)[0].tolist()
plot(class0_ids, f_y_est_chest_neg_prob[class0_ids], '.y')

# Rows whose true label is 1, drawn in red.
class1_ids = np.nonzero(y_chest == 1)[0].tolist()
plot(class1_ids, f_y_est_chest_neg_prob[class1_ids], '.r')

xlabel('Data object (patient)')
ylabel('Predicted prob. of negative diagnosis')
legend(['Negative diagnosis', 'Positive diagnosis'])
# Upper limit is above 1 so the legend has room above the points.
ylim(-0.01, 1.5)

# Save before show(), then display the figure.
savefig("scatter_log_regression.png")
show()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment