Create a gist now

Instantly share code, notes, and snippets.

What would you like to do?
Experiment on logic regression
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
from sklearn.datasets import load_iris
import numpy as np
import math
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from scipy.optimize import fmin_bfgs
#==============================================================================
# Circular Data
data_circular = np.array([[0.051267,0.69956,1],
[-0.092742,0.68494,1],
[-0.21371,0.69225,1],
[-0.375,0.50219,1],
[-0.51325,0.46564,1],
[-0.52477,0.2098,1],
[-0.39804,0.034357,1],
[-0.30588,-0.19225,1],
[0.016705,-0.40424,1],
[0.13191,-0.51389,1],
[0.38537,-0.56506,1],
[0.52938,-0.5212,1],
[0.63882,-0.24342,1],
[0.73675,-0.18494,1],
[0.54666,0.48757,1],
[0.322,0.5826,1],
[0.16647,0.53874,1],
[-0.046659,0.81652,1],
[-0.17339,0.69956,1],
[-0.47869,0.63377,1],
[-0.60541,0.59722,1],
[-0.62846,0.33406,1],
[-0.59389,0.005117,1],
[-0.42108,-0.27266,1],
[-0.11578,-0.39693,1],
[0.20104,-0.60161,1],
[0.46601,-0.53582,1],
[0.67339,-0.53582,1],
[-0.13882,0.54605,1],
[-0.29435,0.77997,1],
[-0.26555,0.96272,1],
[-0.16187,0.8019,1],
[-0.17339,0.64839,1],
[-0.28283,0.47295,1],
[-0.36348,0.31213,1],
[-0.30012,0.027047,1],
[-0.23675,-0.21418,1],
[-0.06394,-0.18494,1],
[0.062788,-0.16301,1],
[0.22984,-0.41155,1],
[0.2932,-0.2288,1],
[0.48329,-0.18494,1],
[0.64459,-0.14108,1],
[0.46025,0.012427,1],
[0.6273,0.15863,1],
[0.57546,0.26827,1],
[0.72523,0.44371,1],
[0.22408,0.52412,1],
[0.44297,0.67032,1],
[0.322,0.69225,1],
[0.13767,0.57529,1],
[-0.0063364,0.39985,1],
[-0.092742,0.55336,1],
[-0.20795,0.35599,1],
[-0.20795,0.17325,1],
[-0.43836,0.21711,1],
[-0.21947,-0.016813,1],
[-0.13882,-0.27266,1],
[0.18376,0.93348,0],
[0.22408,0.77997,0],
[0.29896,0.61915,0],
[0.50634,0.75804,0],
[0.61578,0.7288,0],
[0.60426,0.59722,0],
[0.76555,0.50219,0],
[0.92684,0.3633,0],
[0.82316,0.27558,0],
[0.96141,0.085526,0],
[0.93836,0.012427,0],
[0.86348,-0.082602,0],
[0.89804,-0.20687,0],
[0.85196,-0.36769,0],
[0.82892,-0.5212,0],
[0.79435,-0.55775,0],
[0.59274,-0.7405,0],
[0.51786,-0.5943,0],
[0.46601,-0.41886,0],
[0.35081,-0.57968,0],
[0.28744,-0.76974,0],
[0.085829,-0.75512,0],
[0.14919,-0.57968,0],
[-0.13306,-0.4481,0],
[-0.40956,-0.41155,0],
[-0.39228,-0.25804,0],
[-0.74366,-0.25804,0],
[-0.69758,0.041667,0],
[-0.75518,0.2902,0],
[-0.69758,0.68494,0],
[-0.4038,0.70687,0],
[-0.38076,0.91886,0],
[-0.50749,0.90424,0],
[-0.54781,0.70687,0],
[0.10311,0.77997,0],
[0.057028,0.91886,0],
[-0.10426,0.99196,0],
[-0.081221,1.1089,0],
[0.28744,1.087,0],
[0.39689,0.82383,0],
[0.63882,0.88962,0],
[0.82316,0.66301,0],
[0.67339,0.64108,0],
[1.0709,0.10015,0],
[-0.046659,-0.57968,0],
[-0.23675,-0.63816,0],
[-0.15035,-0.36769,0],
[-0.49021,-0.3019,0],
[-0.46717,-0.13377,0],
[-0.28859,-0.060673,0],
[-0.61118,-0.067982,0],
[-0.66302,-0.21418,0],
[-0.59965,-0.41886,0],
[-0.72638,-0.082602,0],
[-0.83007,0.31213,0],
[-0.72062,0.53874,0],
[-0.59389,0.49488,0],
[-0.48445,0.99927,0],
[-0.0063364,0.99927,0],
[0.63265,-0.030612,0]])
#==============================================================================
# helpers to plot decision boundary
# http://scikit-learn.org/stable/auto_examples/svm/plot_iris.html#sphx-glr-auto-examples-svm-plot-iris-py
def make_meshgrid(x, y, h=.02):
x_min, x_max = x.min() - 1, x.max() + 1
y_min, y_max = y.min() - 1, y.max() + 1
return np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
def plot_contours(predict, xx, yy, **params):
Z = predict(np.c_[xx.ravel(), yy.ravel()])
Z = Z.reshape(xx.shape)
ax = params.get('ax', plt)
return ax.contourf(xx, yy, Z, **params)
#==============================================================================
# Implementation of Logistic Regression
def sigmoid(x):
return 1/(1+np.exp(-x))
def h(X, theta):
"""the hypothesis function
:X: a matrix, each row is a sample with columns of features
:theta: an array represents the parameter of the regression array([theta_0, theta_1, ...])
:returns: array([h_0, h_1]), hypothes of each data
"""
product = np.matmul(X, theta)
return sigmoid(product)
def gen_predict(theta):
def predict(X):
return np.where(h(X, theta) >= 0.5, 1, 0)
return predict
def gen_cost(data, labels, h, **params):
"""
:data: a matrix array([[col, col], ...])
:labels: an array that contains the label for all data
:returns: a cost function that takes theta and return (current_cost, array([gradient]))
"""
m = len(data) # number of samples
x_trans = data.transpose()
y = labels
def cost(theta):
""":returns: (current_cost, gradient)"""
hx = h(data, theta)
J = sum(- y*np.log(hx) - (1-y)*np.log(1-hx)) / m
grad = np.matmul(x_trans, (hx-y)) / m
return (J, grad)
return cost
def gen_cost_regularized(data, labels, h, l=1, **params):
m = len(data) # number of samples
x_trans = data.transpose()
y = labels
def cost(theta):
""":returns: (current_cost, gradient)"""
hx = h(data, theta)
J = sum(- y*np.log(hx) - (1-y)*np.log(1-hx)) / m + l/2/m * sum(np.square(theta))
grad = np.matmul(x_trans, (hx-y)) / m + l/m*theta
return (J, grad)
return cost
def gradDescent(X, Y, gen_cost=gen_cost, iterations=1000, a=0.1, tol=1e-3, record_cost=False):
"""train a group of theta
:returns: a vector represents theta
"""
cost = gen_cost(X, Y, h)
rows, cols = X.shape
theta = np.zeros(cols)
# now train
costs = []
for i in range(iterations):
cur_cost, grad = cost(theta)
print(f"current_cost: {cur_cost}")
if record_cost:
costs.append([i, cur_cost])
if cur_cost < tol:
print(f'ends with cost: {cur_cost}')
break
theta = theta - a * grad
if record_cost:
return (theta, np.array(costs))
return theta
# train logistic for iris
iris = load_iris()
X = iris.data[iris.target != 2] # drop the 3rd category
Y = iris.target[iris.target != 2]
# split training set and test sets
ids = np.random.permutation(X.shape[0])
train_ids, test_ids = ids[:70], ids[70:]
X_train, Y_train = X[train_ids, :], Y[train_ids]
X_test, Y_test = X[test_ids, :], Y[test_ids]
# train
rows, cols = X_train.shape
# bias = np.ones(rows)
# X_train = np.insert(X_train, 0, bias, axis=1)
theta = gradDescent(X_train, Y_train, iterations=100)
predict = gen_predict(theta)
predicted = predict(X_test)
print(f"correct rate: {np.count_nonzero(predicted==Y_test)/len(Y_test)*100}%")
# plot 2D
dims = (1, 2)
X0, X1 = X[:, dims[0]], X[:, dims[1]]
xx, yy = make_meshgrid(X0, X1)
# plot the data
predict = gen_predict(theta[list(dims)])
plot_contours(predict, xx, yy, cmap=plt.cm.Blues, alpha=0.8)
plt.scatter(X0, X1, c=np.where(Y==0, 'red', 'blue'), alpha=0.8)
plt.show()
#==============================================================================
# plot decision boundary of Logistic Regression in 3D
# train for only 2 features
X = iris.data[iris.target != 2][:, [1, 2]]
Y = iris.target[iris.target != 2]
X_0, X_1 = X[Y==0], X[Y==1]
theta = gradDescent(X, Y, iterations=100)
predict = lambda X: h(X, theta)
xx, yy = make_meshgrid(X[:, 0], X[:, 1], h=0.01)
Z = predict(np.c_[xx.ravel(), yy.ravel()]).reshape(xx.shape)
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')
ax.scatter(X_0[:, 0], X_0[:, 1], c='r')
ax.scatter(X_1[:, 0], X_1[:, 1], zs=1, c='g')
ax.plot_wireframe(xx, yy, Z, alpha=0.8)
ax.contourf(xx, yy, Z, 1, zdir='z', offset=0)
plt.show()
#==============================================================================
X = data_circular[:,[0,1]]
Y = data_circular[:,2]
plt.scatter(X[:,0], X[:,1], c=np.where(Y==0, 'red', 'blue'), alpha=0.8)
plt.show()
# add non-linear features
def add_non_linear_features(X0, X1):
ret = []
for rank in range(1, 7):
for rank_x0 in range(rank+1):
ret.append((X0**rank_x0) * (X1**(rank-rank_x0)))
return np.column_stack(ret)
X_mapping = add_non_linear_features(X[:,0], X[:,1])
# use fmin_bfgs to train, gradDescent is too slow for this
cost = gen_cost(X_mapping, Y, h)
cost_for_bfgs = lambda theta: cost(theta)[0]
gradient_for_bfgs = lambda theta: cost(theta)[1]
initial_theta = np.zeros(X_mapping.shape[1])
theta = fmin_bfgs(cost_for_bfgs, initial_theta, fprime=gradient_for_bfgs, gtol=1e-6)
# theta = gradDescent(X_mapping, Y, iterations=1000000, a=10)
# plot decision boundary in 2D
predict = gen_predict(theta)
xx, yy = make_meshgrid(X[:,0], X[:,1], h=.01)
Z = predict(add_non_linear_features(xx.ravel(), yy.ravel()))
Z = Z.reshape(xx.shape)
plt.contourf(xx, yy, Z, alpha=0.8)
plt.scatter(X[:,0], X[:,1], c=np.where(Y==0, 'red', 'blue'), alpha=0.8)
plt.show()
#==============================================================================
# prove that square cost won't work
samples = [(-5, 1), (-20, 0), (-2, 1)]
def cost(theta):
def sigmoid(theta, x):
return 1/(1 + math.e**(- theta*x))
diffs = [(sigmoid(theta, x) - y) for x,y in samples]
return sum(diff * diff for diff in diffs)/len(samples)/2
X = np.arange(-1, 1, 0.01)
Y = np.array([cost(theta) for theta in X])
plt.plot(X, Y)
plt.show()
#==============================================================================
# Visually grasp the impact of cost function
X = iris.data[iris.target != 2][:, [1, 2]]
Y = iris.target[iris.target != 2]
X_0, X_1 = X[Y==0], X[Y==1]
cost = gen_cost(X, Y, h)
theta_good = gradDescent(X, Y, iterations=100)
predict_good = lambda X: h(X, theta_good)
theta_bad = gradDescent(X, Y, iterations=25)
predict_bad = lambda X: h(X, theta_bad)
xx, yy = make_meshgrid(X[:, 0], X[:, 1], h=0.01)
Z_good = predict_good(np.c_[xx.ravel(), yy.ravel()]).reshape(xx.shape)
Z_bad = predict_bad(np.c_[xx.ravel(), yy.ravel()]).reshape(xx.shape)
# plot 3D
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')
ax.scatter(X_0[:, 0], X_0[:, 1], c='r')
ax.scatter(X_1[:, 0], X_1[:, 1], zs=1, c='g')
ax.plot_wireframe(xx, yy, Z_good, alpha=0.8)
ax.plot_wireframe(xx, yy, Z_bad, alpha=0.8, color='green')
plt.show()
# plot 2D
predict_good = gen_predict(theta_good)
predict_bad = gen_predict(theta_bad)
fig = plt.figure()
# ax1 = fig.add_subplot(121)
CS = plot_contours(predict_good, xx, yy, alpha=0.5)
plt.clabel(CS, CS.levels[:2], fmt={CS.levels[1]: 'cost = 0.08'}, inline=True, fontsize=10)
CS = plot_contours(predict_bad, xx, yy, alpha=0.5)
plt.clabel(CS, CS.levels[:2], fmt={CS.levels[1]: 'cost = 0.25'}, inline=True, fontsize=10)
plt.scatter(X_0[:, 0], X_0[:, 1], c='r', alpha=0.8)
plt.scatter(X_1[:, 0], X_1[:, 1], c='g', alpha=0.8)
plt.savefig('cost-function-2D.svg')
plt.show()
#==============================================================================
# Regularize experiment
X = data_circular[:,[0,1]]
Y = data_circular[:,2]
# add non-linear features
def add_non_linear_features(X0, X1):
ret = []
for rank in range(1, 7):
for rank_x0 in range(rank+1):
ret.append((X0**rank_x0) * (X1**(rank-rank_x0)))
return np.column_stack(ret)
X_mapping = add_non_linear_features(X[:,0], X[:,1])
# use fmin_bfgs to train, gradDescent is too slow for this
def train(gen_cost, **params):
initial_theta = np.zeros(X_mapping.shape[1])
cost = gen_cost(X_mapping, Y, h, **params)
cost_for_bfgs = lambda theta: cost(theta)[0]
gradient_for_bfgs = lambda theta: cost(theta)[1]
return fmin_bfgs(cost_for_bfgs, initial_theta, fprime=gradient_for_bfgs)
# return gradDescent(X_mapping, Y, gen_cost_regularized, **params)
theta_not_regularized = train(gen_cost)
theta_regularized = train(gen_cost_regularized, l=0.005)
# plot decision boundary in 2D
def plot_circular(theta, plt=plt):
predict = gen_predict(theta)
xx, yy = make_meshgrid(X[:,0], X[:,1], h=.01)
Z = predict(add_non_linear_features(xx.ravel(), yy.ravel()))
Z = Z.reshape(xx.shape)
plt.scatter(X[:,0], X[:,1], c=np.where(Y==0, 'red', 'blue'), alpha=0.8)
plt.contourf(xx, yy, Z, alpha=0.6)
fig = plt.figure()
ax1 = fig.add_subplot(221)
l0 = train(gen_cost_regularized, l=0)
plot_circular(l0, plt=ax1)
ax1.set_title(r'$\lambda = 0$')
ax2 = fig.add_subplot(222)
l1 = train(gen_cost_regularized, l=0.001)
plot_circular(l1, plt=ax2)
ax2.set_title(r'$\lambda = 0.001$')
ax3 = fig.add_subplot(223)
l2 = train(gen_cost_regularized, l=0.01)
plot_circular(l2, plt=ax3)
ax3.set_title(r'$\lambda = 0.01$')
ax4 = fig.add_subplot(224)
l3 = train(gen_cost_regularized, l=0.1)
plot_circular(l3, plt=ax4)
ax4.set_title(r'$\lambda = 0.1$')
plt.show()
#==============================================================================
# Scaling
import sklearn.preprocessing as preprocessing
scaler = preprocessing.StandardScaler()
X_mapping_scaled = scaler.fit_transform(X_mapping)
theta_normal, costs_normal = gradDescent(X_mapping, Y, gen_cost, record_cost=True)
theta_scaled, costs_scaled = gradDescent(X_mapping_scaled, Y, gen_cost, record_cost=True)
not_scaled, = plt.plot(costs_normal[:,0], costs_normal[:,1], label='Not Scaled')
scaled, = plt.plot(costs_scaled[:,0], costs_scaled[:,1], label='Scaled')
plt.legend([not_scaled, scaled], ['Not scaled', 'Scaled'])
plt.xlabel('Number of Iterations')
plt.ylabel(r'$J(\theta)$')
plt.savefig('learning-rate-scaled.svg')
#==============================================================================
# With or without Bias
X = iris.data[iris.target != 2][:, [2, 3]]
Y = iris.target[iris.target != 2]
X_0, X_1 = X[Y==0], X[Y==1]
def add_bias(X):
bias = np.ones(X.shape[0])
return np.insert(X, 0, bias, axis=1)
theta_no_bias = gradDescent(X, Y, iterations=1000)
predict_no_bias = lambda X: np.where(h(X, theta_no_bias)>=0.5, 1, 0)
X_with_bias = add_bias(X)
theta_with_bias = gradDescent(X_with_bias, Y, iterations=1000)
predict_with_bias = lambda X: np.where(h(add_bias(X), theta_with_bias)>=0.5, 1, 0)
x_min, x_max = -1, X[:,0].max() + 1
y_min, y_max = X[:,1].min() - 1, X[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, 0.01), np.arange(y_min, y_max, 0.01))
fig = plt.figure()
CS = plot_contours(predict_no_bias, xx, yy, alpha=0.5)
plt.clabel(CS, CS.levels[:2], fmt={CS.levels[1]: 'No Bias'}, inline=True, fontsize=16)
CS = plot_contours(predict_with_bias, xx, yy, alpha=0.5)
plt.clabel(CS, CS.levels[:2], fmt={CS.levels[1]: 'Bias'}, inline=True, fontsize=16)
plt.scatter(X_0[:, 0], X_0[:, 1], c='r', alpha=0.8)
plt.scatter(X_1[:, 0], X_1[:, 1], c='g', alpha=0.8)
plt.plot(np.linspace(-1,0), np.zeros(len(np.linspace(-1,0))), 'c--')
plt.plot(np.zeros(len(np.linspace(y_min,0))), np.linspace(y_min,0), 'c--')
plt.show()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment