Skip to content

Instantly share code, notes, and snippets.

@gavinwhyte
Last active October 20, 2015 21:10
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save gavinwhyte/770349db0e8c49f215df to your computer and use it in GitHub Desktop.
Save gavinwhyte/770349db0e8c49f215df to your computer and use it in GitHub Desktop.
Rocks vs mines Machine Learning Classification
__author__ = 'gavinwhyte'
import urllib2
import numpy
import random
from sklearn import datasets, linear_model
from sklearn.metrics import roc_curve, auc
import pylab as pl
def confusionMatrix(predicted, actual, threshold):
    """Count confusion-matrix entries for thresholded real-valued predictions.

    A prediction is treated as positive only when it is strictly greater
    than ``threshold``; a prediction exactly equal to the threshold is a
    negative prediction for both classes.  An actual label is treated as
    positive when it is greater than 0.5 (i.e. labels coded 1.0 / 0.0).

    Args:
        predicted: sequence of real-valued model outputs.
        actual: sequence of true labels, same length as ``predicted``.
        threshold: cut-off applied to ``predicted``.

    Returns:
        ``[tp, fn, fp, tn]`` as floats, or ``-1`` when the two sequences
        differ in length (kept for backward compatibility with callers).
    """
    if len(predicted) != len(actual):
        return -1

    tp = 0.0
    fp = 0.0
    tn = 0.0
    fn = 0.0
    for score, label in zip(predicted, actual):
        # Decide the predicted class once so ties are handled the same way
        # for positive and negative examples.
        predictedPositive = score > threshold
        if label > 0.5:  # labels that are 1.0 (positive examples)
            if predictedPositive:
                tp += 1.0  # correctly predicted positive
            else:
                fn += 1.0  # incorrectly predicted negative
        else:  # labels that are 0.0 (negative examples)
            if predictedPositive:
                fp += 1.0  # incorrectly predicted positive
            else:
                tn += 1.0  # correctly predicted negative
    return [tp, fn, fp, tn]
# Read the rocks-versus-mines (sonar) data set from the uci.edu data
# repository and split every record into attributes (xList) and label
# (labels).
target_url = ("https://archive.ics.uci.edu/ml/machine-learning-"
              "databases/undocumented/connectionist-bench/sonar/sonar.all-data")
data = urllib2.urlopen(target_url)
xList = []
labels = []
try:
    for line in data:
        line = line.strip()
        # Skip blank lines (e.g. a trailing newline at the end of the file);
        # otherwise they would be stored as an empty attribute row with
        # label 0.0.
        if not line:
            continue
        # split on comma
        row = line.split(",")
        # The last field is the class: assign label 1.0 for "M" and 0.0
        # for anything else ("R"); pop() also removes it from the row.
        labels.append(1.0 if row.pop() == "M" else 0.0)
        # convert the remaining fields to floats
        xList.append([float(num) for num in row])
finally:
    # Release the HTTP connection even if parsing a line fails.
    data.close()
# Split the attribute matrix and label vector: every third example
# (index divisible by 3) goes to the test set (1/3 of the data), the rest
# to the training set (2/3 of the data).
indices = range(len(xList))
print(indices)

xListTest, xListTrain = [], []
labelsTest, labelsTrain = [], []
for i in indices:
    if i % 3 == 0:
        xListTest.append(xList[i])
        labelsTest.append(labels[i])
    else:
        xListTrain.append(xList[i])
        labelsTrain.append(labels[i])

# Convert the lists of inputs into numpy arrays, the input type used with
# the scikit-learn linear model below.
xTrain, yTrain = numpy.array(xListTrain), numpy.array(labelsTrain)
xTest, yTest = numpy.array(xListTest), numpy.array(labelsTest)

# Report the array shapes as a sanity check.
print("Shape of xTrain array", xTrain.shape)
print("Shape of yTrain array", yTrain.shape)
print("Shape of xTest array", xTest.shape)
print("Shape of yTest array", yTest.shape)
# Fit a linear regression model to the training data.
rocksVMinesModel = linear_model.LinearRegression()
rocksVMinesModel.fit(xTrain, yTrain)

# Predictions on the training set (in-sample).
trainingPredictions = rocksVMinesModel.predict(xTrain)
print (trainingPredictions)

# Show a sample from both ends of the prediction vector; note that the
# [-7:-1] slice stops before the final prediction.
firstPredictions = trainingPredictions[:12]
lastPredictions = trainingPredictions[-7:-1]
print("Some values predicted by model", firstPredictions, lastPredictions)

# Confusion matrix for the in-sample predictions at a 0.5 threshold;
# entries come back in the order [tp, fn, fp, tn].
confusionMatTrain = confusionMatrix(trainingPredictions, yTrain, 0.5)
tp, fn, fp, tn = confusionMatTrain
print("".join(["tp = ", str(tp), "\tfn = ", str(fn), "\n",
               "fp = ", str(fp), "\ttn = ", str(tn), '\n']))
# Predictions on the held-out test set (out-of-sample).
testPredictions = rocksVMinesModel.predict(xTest)

# Confusion matrix for the out-of-sample predictions at a 0.5 threshold;
# entries come back in the order [tp, fn, fp, tn].
conMatTest = confusionMatrix(testPredictions, yTest, 0.5)
tp, fn, fp, tn = conMatTest
print("".join(["tp = ", str(tp), "\tfn = ", str(fn), "\n",
               "fp = ", str(fp), "\ttn = ", str(tn), '\n']))
# Generate the ROC curve for the in-sample (training) predictions and
# report the area under it.
fpr, tpr, thresholds = roc_curve(yTrain, trainingPredictions)
roc_auc = auc(fpr, tpr)
print('Auc for in-sample ROC Curve: %f' % roc_auc)

# Plot the ROC curve together with the diagonal from (0, 0) to (1, 1)
# as a reference line.
pl.clf()
# The legend label now closes its parenthesis.
pl.plot(fpr, tpr, label='ROC curve(area = %0.2f)' % roc_auc)
pl.plot([0, 1], [0, 1], 'k-')
pl.xlim([0.0, 1.0])
pl.ylim([0.0, 1.0])
pl.xlabel('False positive Rate')
pl.ylabel('True Positive Rate')
pl.title('In sample ROC rocks versus mines')
pl.legend(loc="lower right")
pl.show()
# Generate the ROC curve for the out-of-sample (test) predictions and
# report the area under it.
fpr, tpr, thresholds = roc_curve(yTest, testPredictions)
roc_auc = auc(fpr, tpr)
# This AUC is computed from the test set, so label it as out-of-sample.
print('Auc for out-of-sample ROC Curve: %f' % roc_auc)

# Plot the ROC curve together with the diagonal from (0, 0) to (1, 1)
# as a reference line.
pl.clf()
# The legend label now closes its parenthesis.
pl.plot(fpr, tpr, label='ROC curve(area = %0.2f)' % roc_auc)
pl.plot([0, 1], [0, 1], 'k-')
pl.xlim([0.0, 1.0])
pl.ylim([0.0, 1.0])
pl.xlabel('False positive Rate')
pl.ylabel('True Positive Rate')
pl.title('Out-of-sample ROC rocks versus mines')
pl.legend(loc="lower right")
pl.show()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment