Created
November 14, 2017 08:27
-
-
Save duarteocarmo/84b7164f6532f5101f7380e3a5e86982 to your computer and use it in GitHub Desktop.
Classification - Logistic Regression
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*- | |
""" | |
Created on Sun Nov 12 09:54:58 2017 | |
@author: sviglios | |
""" | |
from Project_Clean_data import raw, header, is_binary | |
from sklearn import cross_validation, tree | |
import sklearn.linear_model as lm | |
from sklearn.metrics import confusion_matrix | |
from Project_Clean_data import raw, header, is_binary | |
from matplotlib.pyplot import figure, plot, subplot, title, xlabel, ylabel, show, clim, ion, legend, boxplot, savefig, imshow, colorbar, xticks, yticks, ylim | |
import matplotlib.pyplot as plt | |
import numpy as np | |
import math | |
# Load the chest samples (scored in the final test) and the indices of the
# rows that are removed from the data before cross-validation.
X_chest = np.loadtxt('chest.txt', dtype=int)
final_cand = np.loadtxt('final_cand.txt', dtype=int)

# Locate the column that holds the attribute to predict.
target_attribute_name = 'Dx'
target_index = list(header).index(target_attribute_name)

# Labels: the target column of the raw matrix without the excluded rows.
y = np.delete(raw[:, target_index], final_cand)

# Features: all remaining columns, with the same rows excluded.
X = np.delete(np.delete(raw, target_index, axis=1), final_cand, axis=0)
attributeNames = np.delete(header, target_index)

N, M = X.shape  # rows (samples) and columns (attributes) of the feature matrix
C = 2  # presumably the number of classes; not referenced elsewhere in this script
# K-fold cross-validation: fit one logistic regression model per fold and
# record its misclassification rate on the fold's training and test split.
K = 10
# NOTE: shuffle=True without a random_state yields different folds on each run.
CV = KFold(n_splits=K, shuffle=True)

Error_train = np.empty(K)   # per-fold training error, indexed by fold number
Error_test = np.empty(K)    # per-fold test error, indexed by fold number
err_train = []              # same values as lists; err_test is used to pick a model
err_test = []
models = []                 # fitted model of every fold, in fold order

for k, (train_index, test_index) in enumerate(CV.split(X)):
    print('Computing CV fold: {0}/{1}..'.format(k+1,K))

    # Extract training and test set for the current CV fold.
    X_train, y_train = X[train_index, :], y[train_index]
    X_test, y_test = X[test_index, :], y[test_index]

    # Fit the logistic regression model (fit() returns the estimator itself).
    model = lm.LogisticRegression().fit(X_train, y_train)
    y_est_train = model.predict(X_train)
    y_est_test = model.predict(X_test)

    # Misclassification rate = fraction of predictions that differ from the
    # true label. Comparing with != stays correct for any label encoding,
    # unlike summing absolute differences, which only works for 0/1 labels.
    misclass_rate_train = np.mean(y_est_train != y_train)
    misclass_rate_test = np.mean(y_est_test != y_test)

    Error_train[k], Error_test[k] = misclass_rate_train, misclass_rate_test
    models.append(model)
    err_train.append(misclass_rate_train)
    err_test.append(misclass_rate_test)
# Final test: score the chest set with the cross-validation model that had
# the lowest test error, then plot the predicted probabilities per sample.
y_chest = X_chest[:, target_index]
X_chest = np.delete(X_chest, target_index, axis=1)

# NOTE(review): choosing the fold model by its own test error is an
# optimistic selection; the chest-set error below is the figure to trust.
index = err_test.index(min(err_test))
last_model = models[index]

f_y_est_chest = last_model.predict(X_chest)
# Fraction of chest samples whose predicted label differs from the true one.
f_misclass_rate_chest = np.mean(f_y_est_chest != y_chest)
accuracy = 1 - f_misclass_rate_chest
print(f_misclass_rate_chest)

# Column 0 of predict_proba belongs to the first entry of last_model.classes_,
# i.e. label 0 when the labels are 0/1.
f_y_est_chest_neg_prob = last_model.predict_proba(X_chest)[:, 0]

# Figure.hold() no longer exists in Matplotlib >= 3.0; consecutive plot()
# calls already draw onto the same axes, so no replacement is needed.
f = figure()
class0_ids = np.flatnonzero(y_chest == 0)
plot(class0_ids, f_y_est_chest_neg_prob[class0_ids], '.y')
class1_ids = np.flatnonzero(y_chest == 1)
plot(class1_ids, f_y_est_chest_neg_prob[class1_ids], '.r')
xlabel('Data object (patient)')
ylabel('Predicted prob. of negative diagnosis')
legend(['Negative diagnosis', 'Positive diagnosis'])
ylim(-0.01,1.5)
savefig("scatter_log_regression.png")
show()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment