Created
June 3, 2017 06:13
-
-
Save teknomo/7b87e75426f25eb67256cf2d4724ce5a to your computer and use it in GitHub Desktop.
SVM in Python
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# coding: utf-8 | |
# # SVM in Python | |
# | |
# by *Kardi Teknomo* | |
# | |
# last update: June 2017 | |
# This is a practice/laboratory session of SVM tutorial using Python. | |
# | |
# First, you need to import the necessary modules. | |
# In[1]: | |
import numpy as np | |
import pandas as pd | |
import matplotlib.pyplot as plt | |
from sklearn import svm | |
import sklearn.metrics as metric | |
# Render figures inline in the notebook.  `magic()` has been deprecated since
# IPython 0.13; `run_line_magic()` is the supported spelling of the same call.
get_ipython().run_line_magic('matplotlib', 'inline')
# # Linearly Separable Dataset | |
# | |
# Our first dataset can be uploaded. Put the CSV files in the same folder as the Jupyter notebook. | |
# ## Data | |
# In[2]: | |
# Read the first dataset; the relative path is resolved against the working directory.
filedata = 'SVM_Dataset1.csv'
data1 = pd.read_csv(filedata)
data1
# Split the frame into the feature matrix X and the label vector y.
# In[3]:
X1, X2 = data1['X1'], data1['X2']
# Put the two feature columns side by side: one row per sample, two columns.
X_training = np.column_stack((X1, X2))
X_training
# In[4]:
y_training = data1['y']
y_training
# Let us plot this data. Can you imagine a line separating the two classes? | |
# In[5]: | |
# Boolean row masks for the two classes, taken from the sign of the label.
is_negative = (y_training < 0).values
is_positive = (y_training > 0).values
# Draw the negative class first so the artists line up with the legend order.
for mask, colour in ((is_negative, 'b'), (is_positive, 'r')):
    plt.scatter(X_training[mask, 0], X_training[mask, 1], c=colour, s=50)
plt.legend(['-1', '+1'], loc=2)
plt.xlabel('X1')
plt.ylabel('X2')
plt.savefig('chart0.png')
# ## SVM Training | |
# To use the Support Vector Classification (SVC) algorithm, we need to define the model **kernel**. Let us use a *linear kernel*. Then, we use the **fit()** function to train the model with our training data.
# | |
# In[6]: | |
# Create a linear-kernel classifier, then fit it on the training set.
svc = svm.SVC(kernel='linear')
svc.fit(X_training, y_training)
svc
# In[7]:
# Legend labels, listed in the order in which the classes are drawn.
target_names = ['-1', '+1']
target_names
# The trained model can be plotted by evaluating its decision_function() method on a grid of points.
# In[8]: | |
# Scatter the two classes, negative first to match the order of target_names.
is_negative = (y_training < 0).values
is_positive = (y_training > 0).values
plt.scatter(X_training[is_negative, 0], X_training[is_negative, 1], c='b', s=50)
plt.scatter(X_training[is_positive, 0], X_training[is_positive, 1], c='r', s=50)
plt.legend(target_names, loc=2)
# Evaluate the decision function on a 100 x 100 grid covering the plotted range.
X, Y = np.mgrid[1:4:100j, 2:8:100j]
grid_points = np.column_stack((X.ravel(), Y.ravel()))
Z = svc.decision_function(grid_points).reshape(X.shape)
# Shade the region with a positive score and trace the boundary where the score is zero.
plt.contourf(X, Y, Z > 0, alpha=0.4)
plt.contour(X, Y, Z, colors=['k'], linestyles=['-'], levels=[0])
plt.title('Linearly Separable')
plt.savefig('chart1.png')
# The following plot shows the margin and the support vectors.
# In[9]: | |
# Scatter the two classes, negative first to match the order of target_names.
is_negative = (y_training < 0).values
is_positive = (y_training > 0).values
plt.scatter(X_training[is_negative, 0], X_training[is_negative, 1], c='b', s=50)
plt.scatter(X_training[is_positive, 0], X_training[is_positive, 1], c='r', s=50)
plt.legend(target_names, loc=2)
# Evaluate the decision function on a 100 x 100 grid covering the plotted range.
X, Y = np.mgrid[1:4:100j, 2:8:100j]
Z = svc.decision_function(np.column_stack((X.ravel(), Y.ravel())))
Z = Z.reshape(X.shape)
plt.contourf(X, Y, Z > 0, alpha=0.4)
# Solid line: decision boundary (score 0).  Dashed lines: the margins (score -1 and +1).
plt.contour(X, Y, Z, colors=['k', 'k', 'k'], linestyles=['--', '-', '--'], levels=[-1, 0, 1])
# Ring the support vectors with hollow markers.  The edge colour is given
# explicitly: scatter's default edge colour follows the face colour, so with
# facecolors='none' alone the rings would have no visible outline.
plt.scatter(svc.support_vectors_[:, 0], svc.support_vectors_[:, 1],
            s=120, facecolors='none', edgecolors='k')
# Overlay every training sample, coloured by its label.
plt.scatter(X_training[:, 0], X_training[:, 1], c=y_training, s=50, alpha=0.95)
plt.title('Margin and Support Vectors')
plt.savefig('chart2.png')
# ## Training Performances | |
# To get the normalized accuracy of the training, we can use the **score(X,y)** function.
# In[10]: | |
# Mean accuracy of the fitted model on the training set.
svc.score(X=X_training, y=y_training)
# Alternatively, if you have a test sample, you can use the metrics from sklearn. To apply them to the training sample, we first need the predicted labels (the model's prediction for the training X) and the true labels (the y of the training sample).
# In[11]:
y_pred = svc.predict(X=X_training)
y_pred
# In[12]:
y_true = y_training
y_true
# The absolute accuracy (normalize=False gives the number of correct predictions rather than a fraction) is measured as follows.
# In[13]:
metric.accuracy_score(y_true=y_true, y_pred=y_pred, normalize=False)
# The confusion matrix is useful for spotting misclassification. If there is no misclassification, all the counts lie on the diagonal.
# In[14]:
cnf_matrix = metric.confusion_matrix(y_true=y_true, y_pred=y_pred)
cnf_matrix
# We can also visualize the confusion matrix through the following function | |
# In[15]: | |
import itertools


def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """Print a confusion matrix and draw it as an annotated heat map.

    The matrix is drawn on the current matplotlib figure with rows labelled
    'True label' and columns labelled 'Predicted label'.

    Parameters
    ----------
    cm : array-like, 2-D
        Confusion matrix to display.
    classes : sequence
        Tick labels, one per row/column of ``cm``.
    normalize : bool, default False
        When true, each row is divided by its sum before printing and
        plotting.  Rows that sum to zero are shown as zeros.
    title : str
        Title of the plot.
    cmap : matplotlib colormap
        Colormap used for the heat map.
    """
    cm = np.asarray(cm)
    # Normalise *before* drawing so the heat map, the colour bar and the
    # per-cell text all describe the same numbers.
    if normalize:
        row_totals = cm.sum(axis=1, keepdims=True)
        cm = np.divide(cm, row_totals,
                       out=np.zeros(cm.shape, dtype=float),
                       where=row_totals != 0)
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')
    print(cm)

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    # Fractions get two decimals; integer counts are printed as they are.
    cell_format = '.2f' if np.issubdtype(cm.dtype, np.floating) else 'd'
    # Cells darker than half of the maximum get white text for contrast.
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], cell_format),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")
    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')


# Draw the training confusion matrix on a fresh figure.
plt.figure()
plot_confusion_matrix(cnf_matrix, classes=target_names, normalize=False)
plt.show()
# ### SVM Prediction | |
# | |
# Now we can also use the trained SVM to predict something that is outside the training data. Let us predict the class y of the given test data [X1, X2] = [3, 6] | |
# In[16]: | |
# Classify a single point that is not part of the training data.
test_point = [3, 6]
svc.predict([test_point])
# The test data is now plotted.
# In[17]:
is_negative = (y_training < 0).values
is_positive = (y_training > 0).values
plt.scatter(X_training[is_negative, 0], X_training[is_negative, 1], c='b', s=50)
plt.scatter(X_training[is_positive, 0], X_training[is_positive, 1], c='r', s=50)
# The test point is drawn as a red square.
plt.scatter(test_point[0], test_point[1], c='r', marker='s', s=90)
plt.legend(['-1', '+1', 'prediction'], loc=2)
# Decision function on a 100 x 100 grid, used for the shading and the boundary.
X, Y = np.mgrid[1:4:100j, 2:8:100j]
grid_points = np.column_stack((X.ravel(), Y.ravel()))
Z = svc.decision_function(grid_points).reshape(X.shape)
plt.contourf(X, Y, Z > 0, alpha=0.4)
plt.contour(X, Y, Z, colors=['k'], linestyles=['-'], levels=[0])
plt.title('Prediction')
plt.savefig('chart3.png')
# In[18]: | |
# Drop the first dataset's arrays before the next dataset is loaded.
del X1, X2
del X_training, y_training
# # Non Linearly Separable Dataset
#
# ## Data
# Now we load the second dataset.
# In[19]:
filedata = 'SVM_Dataset2.csv'
data2 = pd.read_csv(filedata)
data2
# Split the frame into the feature matrix X and the label vector y.
# In[20]:
X1, X2 = data2['x1'], data2['x2']
# Put the two feature columns side by side: one row per sample, two columns.
X_training = np.column_stack((X1, X2))
X_training
# In[21]:
y_training = data2['y']
y_training
# Let us plot this data. Can you imagine a line separating the two classes? | |
# In[22]: | |
# Boolean row masks for the two classes, taken from the sign of the label.
is_negative = (y_training < 0).values
is_positive = (y_training > 0).values
# Draw the negative class first so the artists line up with target_names.
for mask, colour in ((is_negative, 'b'), (is_positive, 'r')):
    plt.scatter(X_training[mask, 0], X_training[mask, 1], c=colour, s=50)
plt.legend(target_names, loc=3)
plt.xlabel('X1')
plt.ylabel('X2')
plt.savefig('nl-chart0.png')
# ## Training | |
# The plot shows that no line can separate the two classes. If we still want to use a linear kernel, we can define a regularization cost parameter **C**. We use the **fit()** function to train the model with our training data. Feel free to change the regularization parameter so that the classification error is as small as possible.
# In[23]: | |
# Linear kernel with regularization cost C=1, fitted on the second dataset.
svc = svm.SVC(kernel='linear', C=1)
svc.fit(X_training, y_training)
is_negative = (y_training < 0).values
is_positive = (y_training > 0).values
plt.scatter(X_training[is_negative, 0], X_training[is_negative, 1], c='b', s=50)
plt.scatter(X_training[is_positive, 0], X_training[is_positive, 1], c='r', s=50)
plt.legend(target_names, loc=3)
# Decision function on a 100 x 100 grid, used for the shading and the boundary.
X, Y = np.mgrid[2:18:100j, 4:22:100j]
grid_points = np.column_stack((X.ravel(), Y.ravel()))
Z = svc.decision_function(grid_points).reshape(X.shape)
plt.contourf(X, Y, Z > 0, alpha=0.4)
plt.contour(X, Y, Z, colors=['k'], linestyles=['-'], levels=[0])
plt.title('Non-Linearly Separable')
plt.savefig('nl-chart1.png')
# In[24]: | |
# Linear kernel again, now with a much larger regularization cost (C=100).
svc = svm.SVC(kernel='linear', C=100).fit(X_training, y_training)
is_negative = (y_training < 0).values
is_positive = (y_training > 0).values
plt.scatter(X_training[is_negative, 0], X_training[is_negative, 1], c='b', s=50)
plt.scatter(X_training[is_positive, 0], X_training[is_positive, 1], c='r', s=50)
plt.legend(['-1', '+1'], loc=2)
# Decision function on a 100 x 100 grid covering the plotted range.
X, Y = np.mgrid[2:18:100j, 4:22:100j]
Z = svc.decision_function(np.column_stack((X.ravel(), Y.ravel())))
Z = Z.reshape(X.shape)
plt.contourf(X, Y, Z > 0, alpha=0.4)
# Solid line: decision boundary (score 0).  Dashed lines: the margins (score -1 and +1).
plt.contour(X, Y, Z, colors=['k', 'k', 'k'], linestyles=['--', '-', '--'], levels=[-1, 0, 1])
# Ring the support vectors with hollow markers.  The edge colour is given
# explicitly: scatter's default edge colour follows the face colour, so with
# facecolors='none' alone the rings would have no visible outline.
plt.scatter(svc.support_vectors_[:, 0], svc.support_vectors_[:, 1],
            s=120, facecolors='none', edgecolors='k')
# Overlay every training sample, coloured by its label.
plt.scatter(X_training[:, 0], X_training[:, 1], c=y_training, s=50, alpha=0.95)
plt.title('Linear Kernel with regularization')
plt.savefig('nl-chart-regularization.png')
svc.score(X_training, y_training)
# ## Training Performance | |
# | |
# In[25]: | |
# Mean accuracy of the current model on the training set.
svc.score(X=X_training, y=y_training)
# In[26]:
# Number of correctly classified training samples (normalize=False gives a count).
y_pred = svc.predict(X=X_training)
y_true = y_training
metric.accuracy_score(y_true=y_true, y_pred=y_pred, normalize=False)
# Using the linear kernel, we found 4 support vectors, but one out of the 20 data points is on the wrong side. Thus the accuracy is 19 / 20 * 100 = 95%.
# | |
# It seems we are stuck with one misclassification. | |
# | |
# | |
# ## Training with Polynomial Kernel | |
# | |
# Now it is the time to change the kernel into non-linear kernel. Let us try to use polynomial kernel. | |
# | |
# There is no misclassification if the degree = 2 and above. | |
# | |
# You can play with the degree and regularization parameter C. | |
# + Will the number of support vectors increase if you increase the degree?
# + Will the number of support vectors increase if you increase the regularization parameter?
# | |
# In[27]: | |
# Polynomial kernel of degree 2 with C=1, fitted on the second dataset.
svc = svm.SVC(kernel='poly', C=1, degree=2).fit(X_training, y_training)
is_negative = (y_training < 0).values
is_positive = (y_training > 0).values
plt.scatter(X_training[is_negative, 0], X_training[is_negative, 1], c='b', s=50)
plt.scatter(X_training[is_positive, 0], X_training[is_positive, 1], c='r', s=50)
plt.legend(['-1', '+1'], loc=2)
# Decision function on a 100 x 100 grid covering the plotted range.
X, Y = np.mgrid[2:17:100j, 0:21:100j]
Z = svc.decision_function(np.column_stack((X.ravel(), Y.ravel())))
Z = Z.reshape(X.shape)
plt.contourf(X, Y, Z > 0, alpha=0.4)
# Solid line: decision boundary (score 0).  Dashed lines: the margins (score -1 and +1).
plt.contour(X, Y, Z, colors=['k', 'k', 'k'], linestyles=['--', '-', '--'], levels=[-1, 0, 1])
# Ring the support vectors with hollow markers.  The edge colour is given
# explicitly: scatter's default edge colour follows the face colour, so with
# facecolors='none' alone the rings would have no visible outline.
plt.scatter(svc.support_vectors_[:, 0], svc.support_vectors_[:, 1],
            s=120, facecolors='none', edgecolors='k')
# Overlay every training sample, coloured by its label.
plt.scatter(X_training[:, 0], X_training[:, 1], c=y_training, s=50, alpha=0.95)
plt.title('Polynomial Kernel')
plt.savefig('nl-poly2-kernel.png')
svc.score(X_training, y_training)
# ## Overfitting | |
# | |
# Just to give rough idea of overfitting, now let us use RBF kernel. | |
# | |
# In[28]: | |
# RBF kernel with C=1 and gamma=3, fitted on the second dataset.
svc = svm.SVC(kernel='rbf', C=1, gamma=3).fit(X_training, y_training)
is_negative = (y_training < 0).values
is_positive = (y_training > 0).values
plt.scatter(X_training[is_negative, 0], X_training[is_negative, 1], c='b', s=50)
plt.scatter(X_training[is_positive, 0], X_training[is_positive, 1], c='r', s=50)
plt.legend(['-1', '+1'], loc=2)
# Decision function on a 100 x 100 grid covering the plotted range.
X, Y = np.mgrid[2:17:100j, 0:21:100j]
Z = svc.decision_function(np.column_stack((X.ravel(), Y.ravel())))
Z = Z.reshape(X.shape)
plt.contourf(X, Y, Z > 0, alpha=0.4)
# Solid line: decision boundary (score 0).  Dashed lines: the margins (score -1 and +1).
plt.contour(X, Y, Z, colors=['k', 'k', 'k'], linestyles=['--', '-', '--'], levels=[-1, 0, 1])
# Ring the support vectors with hollow markers.  The edge colour is given
# explicitly: scatter's default edge colour follows the face colour, so with
# facecolors='none' alone the rings would have no visible outline.
plt.scatter(svc.support_vectors_[:, 0], svc.support_vectors_[:, 1],
            s=120, facecolors='none', edgecolors='k')
# Overlay every training sample, coloured by its label.
plt.scatter(X_training[:, 0], X_training[:, 1], c=y_training, s=50, alpha=0.95)
# The title states the gamma actually used above (it previously said "gamma 1").
plt.title('RBF Kernel gamma 3')
plt.savefig('nl-RBF1-kernel.png')
svc.score(X_training, y_training)
# In[29]: | |
# Number of correctly classified training samples (normalize=False gives a count).
y_true = y_training
y_pred = svc.predict(X=X_training)
metric.accuracy_score(y_true=y_true, y_pred=y_pred, normalize=False)
# The accuracy of the training has reached 100% (20/20) due to the RBF kernel. However, look at the support vectors. The whole dataset has become support vectors, which is an indication that the RBF model overfits our data. Adding more data may require changing the model.
# | |
# Thus, in our example the Polynomial model with lower degree (degree 2) works better. | |
# # One Dimensional SVM | |
# | |
# At first this may seem to be a simpler problem: we have X in only one dimension. However, the challenge is that the SVM training in scikit-learn does not accept 1D training data.
# | |
# ## Data | |
# In[30]: | |
# Read the third dataset; the relative path is resolved against the working directory.
filedata = 'SVM_Dataset3.csv'
data3 = pd.read_csv(filedata)
data3
# The actual X data has only one dimension, so the training and the scatter plot will not work on it directly.
#
# To make it work, we add a dummy constant feature X2 so that we can plot with a scatter plot and train with SVM.
# In[31]:
X1 = data3['x']
# One dummy value per sample, sized from X1 itself.  At this point `X` is
# still the 100 x 100 plotting grid left over from the previous section, so
# len(X) has nothing to do with this dataset, and zip() would silently drop
# any rows beyond the first 100.  A flat vector (not an (n, 1) column) keeps
# every row a plain pair of numbers, giving a regular (n, 2) array.
X2 = np.ones(len(X1), int)
X_training = np.column_stack((X1, X2))
X_training
# In[32]:
y_training = data3['y']
y_training
# In[33]: | |
# Boolean row masks for the two classes, taken from the sign of the label.
is_negative = (y_training < 0).values
is_positive = (y_training > 0).values
# Draw the negative class first so the artists line up with the legend order.
for mask, colour in ((is_negative, 'b'), (is_positive, 'r')):
    plt.scatter(X_training[mask, 0], X_training[mask, 1], c=colour, s=50)
plt.legend(['-1', '+1'], loc=3)
plt.xlabel('X1')
plt.ylabel('X2')
plt.savefig('data3-chart0.png')
# ## SVM Training and Performance | |
# | |
# Now let us do the SVM training to this dataset and plot | |
# In[34]: | |
# Polynomial kernel of degree 2 with C=1, fitted on the third dataset.
svc = svm.SVC(kernel='poly', C=1, degree=2).fit(X_training, y_training)
is_negative = (y_training < 0).values
is_positive = (y_training > 0).values
plt.scatter(X_training[is_negative, 0], X_training[is_negative, 1], c='b', s=50)
plt.scatter(X_training[is_positive, 0], X_training[is_positive, 1], c='r', s=50)
plt.legend(['-1', '+1'], loc=2)
# The dummy second feature is constant, so the grid only needs a thin band around 1.
X, Y = np.mgrid[-2:14:100j, 0.98:1.02:100j]
Z = svc.decision_function(np.column_stack((X.ravel(), Y.ravel())))
Z = Z.reshape(X.shape)
plt.contourf(X, Y, Z > 0, alpha=0.4)
# Solid line: decision boundary (score 0).  Dashed lines: the margins (score -1 and +1).
plt.contour(X, Y, Z, colors=['k', 'k', 'k'], linestyles=['--', '-', '--'], levels=[-1, 0, 1])
# Ring the support vectors with hollow markers.  The edge colour is given
# explicitly: scatter's default edge colour follows the face colour, so with
# facecolors='none' alone the rings would have no visible outline.
plt.scatter(svc.support_vectors_[:, 0], svc.support_vectors_[:, 1],
            s=120, facecolors='none', edgecolors='k')
# Overlay every training sample, coloured by its label.
plt.scatter(X_training[:, 0], X_training[:, 1], c=y_training, s=50, alpha=0.95)
plt.title('Polynomial Kernel')
# Saved under its own name: the degree-3 cell that follows writes
# 'data3-Poly-kernel.png', which used to overwrite this chart.
plt.savefig('data3-Poly2-kernel.png')
svc.score(X_training, y_training)
# In[35]: | |
# Polynomial kernel of degree 3 with C=1, fitted on the third dataset.
svc = svm.SVC(kernel='poly', C=1, degree=3).fit(X_training, y_training)
is_negative = (y_training < 0).values
is_positive = (y_training > 0).values
plt.scatter(X_training[is_negative, 0], X_training[is_negative, 1], c='b', s=50)
plt.scatter(X_training[is_positive, 0], X_training[is_positive, 1], c='r', s=50)
plt.legend(['-1', '+1'], loc=2)
# The dummy second feature is constant, so the grid only needs a thin band around 1.
X, Y = np.mgrid[-2:14:100j, 0.98:1.02:100j]
Z = svc.decision_function(np.column_stack((X.ravel(), Y.ravel())))
Z = Z.reshape(X.shape)
plt.contourf(X, Y, Z > 0, alpha=0.4)
# Solid line: decision boundary (score 0).  Dashed lines: the margins (score -1 and +1).
plt.contour(X, Y, Z, colors=['k', 'k', 'k'], linestyles=['--', '-', '--'], levels=[-1, 0, 1])
# Ring the support vectors with hollow markers.  The edge colour is given
# explicitly: scatter's default edge colour follows the face colour, so with
# facecolors='none' alone the rings would have no visible outline.
plt.scatter(svc.support_vectors_[:, 0], svc.support_vectors_[:, 1],
            s=120, facecolors='none', edgecolors='k')
# Overlay every training sample, coloured by its label.
plt.scatter(X_training[:, 0], X_training[:, 1], c=y_training, s=50, alpha=0.95)
plt.title('Polynomial Kernel')
plt.savefig('data3-Poly-kernel.png')
svc.score(X_training, y_training)
# We have three support vectors for the polynomial kernel of degree 3 and a training accuracy of 100%. This should be the best model.
# ## Overfitting | |
# | |
# Just for curiosity, we can also try to use RBF kernel | |
# In[36]: | |
# RBF kernel with C=1 and gamma=3, fitted on the third dataset.
svc = svm.SVC(kernel='rbf', C=1, gamma=3).fit(X_training, y_training)
is_negative = (y_training < 0).values
is_positive = (y_training > 0).values
plt.scatter(X_training[is_negative, 0], X_training[is_negative, 1], c='b', s=50)
plt.scatter(X_training[is_positive, 0], X_training[is_positive, 1], c='r', s=50)
plt.legend(['-1', '+1'], loc=2)
# The dummy second feature is constant, so the grid only needs a thin band around 1.
X, Y = np.mgrid[-2:14:100j, 0.98:1.02:100j]
Z = svc.decision_function(np.column_stack((X.ravel(), Y.ravel())))
Z = Z.reshape(X.shape)
plt.contourf(X, Y, Z > 0, alpha=0.4)
# Solid line: decision boundary (score 0).  Dashed lines: the margins (score -1 and +1).
plt.contour(X, Y, Z, colors=['k', 'k', 'k'], linestyles=['--', '-', '--'], levels=[-1, 0, 1])
# Ring the support vectors with hollow markers.  The edge colour is given
# explicitly: scatter's default edge colour follows the face colour, so with
# facecolors='none' alone the rings would have no visible outline.
plt.scatter(svc.support_vectors_[:, 0], svc.support_vectors_[:, 1],
            s=120, facecolors='none', edgecolors='k')
# Overlay every training sample, coloured by its label.
plt.scatter(X_training[:, 0], X_training[:, 1], c=y_training, s=50, alpha=0.95)
plt.title('RBF Kernel')
plt.savefig('data3-RBF-kernel.png')
svc.score(X_training, y_training)
# Just as before, the accuracy is 100%, but the whole dataset has now become support vectors. It is a clear sign of overfitting: the model memorizes the data rather than generalizing from it. Thus, we should avoid using RBF for this dataset.
# | |
# The support vectors in SVM are the quality data that we can use to generate the decision boundary (of the same model). Non-support vector data can be ignored, regardless how many data that you have. That is one of the strengths of SVM. | |
# Visit www.Revoledu.com for more tutorials in Data Science |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
The HTML version is available in my web page:
http://people.revoledu.com/kardi/tutorial/SVM/SVM+in+Python.html