Experiment on logistic regression
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
from sklearn.datasets import load_iris
import numpy as np
import math
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from scipy.optimize import fmin_bfgs
#==============================================================================
# Circular Data
data_circular = np.array([[0.051267,0.69956,1],
[-0.092742,0.68494,1],
[-0.21371,0.69225,1],
[-0.375,0.50219,1],
[-0.51325,0.46564,1],
[-0.52477,0.2098,1],
[-0.39804,0.034357,1],
[-0.30588,-0.19225,1],
[0.016705,-0.40424,1],
[0.13191,-0.51389,1],
[0.38537,-0.56506,1],
[0.52938,-0.5212,1],
[0.63882,-0.24342,1],
[0.73675,-0.18494,1],
[0.54666,0.48757,1],
[0.322,0.5826,1],
[0.16647,0.53874,1],
[-0.046659,0.81652,1],
[-0.17339,0.69956,1],
[-0.47869,0.63377,1],
[-0.60541,0.59722,1],
[-0.62846,0.33406,1],
[-0.59389,0.005117,1],
[-0.42108,-0.27266,1],
[-0.11578,-0.39693,1],
[0.20104,-0.60161,1],
[0.46601,-0.53582,1],
[0.67339,-0.53582,1],
[-0.13882,0.54605,1],
[-0.29435,0.77997,1],
[-0.26555,0.96272,1],
[-0.16187,0.8019,1],
[-0.17339,0.64839,1],
[-0.28283,0.47295,1],
[-0.36348,0.31213,1],
[-0.30012,0.027047,1],
[-0.23675,-0.21418,1],
[-0.06394,-0.18494,1],
[0.062788,-0.16301,1],
[0.22984,-0.41155,1],
[0.2932,-0.2288,1],
[0.48329,-0.18494,1],
[0.64459,-0.14108,1],
[0.46025,0.012427,1],
[0.6273,0.15863,1],
[0.57546,0.26827,1],
[0.72523,0.44371,1],
[0.22408,0.52412,1],
[0.44297,0.67032,1],
[0.322,0.69225,1],
[0.13767,0.57529,1],
[-0.0063364,0.39985,1],
[-0.092742,0.55336,1],
[-0.20795,0.35599,1],
[-0.20795,0.17325,1],
[-0.43836,0.21711,1],
[-0.21947,-0.016813,1],
[-0.13882,-0.27266,1],
[0.18376,0.93348,0],
[0.22408,0.77997,0],
[0.29896,0.61915,0],
[0.50634,0.75804,0],
[0.61578,0.7288,0],
[0.60426,0.59722,0],
[0.76555,0.50219,0],
[0.92684,0.3633,0],
[0.82316,0.27558,0],
[0.96141,0.085526,0],
[0.93836,0.012427,0],
[0.86348,-0.082602,0],
[0.89804,-0.20687,0],
[0.85196,-0.36769,0],
[0.82892,-0.5212,0],
[0.79435,-0.55775,0],
[0.59274,-0.7405,0],
[0.51786,-0.5943,0],
[0.46601,-0.41886,0],
[0.35081,-0.57968,0],
[0.28744,-0.76974,0],
[0.085829,-0.75512,0],
[0.14919,-0.57968,0],
[-0.13306,-0.4481,0],
[-0.40956,-0.41155,0],
[-0.39228,-0.25804,0],
[-0.74366,-0.25804,0],
[-0.69758,0.041667,0],
[-0.75518,0.2902,0],
[-0.69758,0.68494,0],
[-0.4038,0.70687,0],
[-0.38076,0.91886,0],
[-0.50749,0.90424,0],
[-0.54781,0.70687,0],
[0.10311,0.77997,0],
[0.057028,0.91886,0],
[-0.10426,0.99196,0],
[-0.081221,1.1089,0],
[0.28744,1.087,0],
[0.39689,0.82383,0],
[0.63882,0.88962,0],
[0.82316,0.66301,0],
[0.67339,0.64108,0],
[1.0709,0.10015,0],
[-0.046659,-0.57968,0],
[-0.23675,-0.63816,0],
[-0.15035,-0.36769,0],
[-0.49021,-0.3019,0],
[-0.46717,-0.13377,0],
[-0.28859,-0.060673,0],
[-0.61118,-0.067982,0],
[-0.66302,-0.21418,0],
[-0.59965,-0.41886,0],
[-0.72638,-0.082602,0],
[-0.83007,0.31213,0],
[-0.72062,0.53874,0],
[-0.59389,0.49488,0],
[-0.48445,0.99927,0],
[-0.0063364,0.99927,0],
[0.63265,-0.030612,0]])
#==============================================================================
# helpers to plot decision boundary
# http://scikit-learn.org/stable/auto_examples/svm/plot_iris.html#sphx-glr-auto-examples-svm-plot-iris-py
def make_meshgrid(x, y, h=.02):
    x_min, x_max = x.min() - 1, x.max() + 1
    y_min, y_max = y.min() - 1, y.max() + 1
    return np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))

def plot_contours(predict, xx, yy, **params):
    Z = predict(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)
    # pop 'ax' so it is not forwarded to contourf as an unknown keyword argument
    ax = params.pop('ax', plt)
    return ax.contourf(xx, yy, Z, **params)
#==============================================================================
# Implementation of Logistic Regression
def sigmoid(x):
    return 1/(1+np.exp(-x))

def h(X, theta):
    """the hypothesis function
    :X: a matrix, each row is a sample with columns of features
    :theta: an array representing the parameters of the regression, array([theta_0, theta_1, ...])
    :returns: array([h_0, h_1, ...]), the hypothesis for each sample
    """
    product = np.matmul(X, theta)
    return sigmoid(product)
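# Optional sanity check (not in the original script): scipy's expit is a numerically
# stable implementation of the same logistic function; for moderate inputs it agrees
# with the hand-written sigmoid above.
from scipy.special import expit
assert np.allclose(sigmoid(np.linspace(-10, 10, 5)), expit(np.linspace(-10, 10, 5)))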
def gen_predict(theta):
    def predict(X):
        # h(x) >= 0.5 is equivalent to theta . x >= 0, i.e. the positive side of the boundary
        return np.where(h(X, theta) >= 0.5, 1, 0)
    return predict
def gen_cost(data, labels, h, **params):
    """
    :data: a matrix array([[col, col], ...])
    :labels: an array that contains the label of each sample
    :returns: a cost function that takes theta and returns (current_cost, array([gradient]))
    """
    m = len(data)  # number of samples
    x_trans = data.transpose()
    y = labels
    def cost(theta):
        """:returns: (current_cost, gradient)"""
        hx = h(data, theta)
        J = np.sum(- y*np.log(hx) - (1-y)*np.log(1-hx)) / m
        grad = np.matmul(x_trans, (hx-y)) / m
        return (J, grad)
    return cost

def gen_cost_regularized(data, labels, h, l=1, **params):
    m = len(data)  # number of samples
    x_trans = data.transpose()
    y = labels
    def cost(theta):
        """:returns: (current_cost, gradient)"""
        hx = h(data, theta)
        # the L2 penalty is applied to every component of theta
        # (the feature mappings used below contain no separate bias column)
        J = np.sum(- y*np.log(hx) - (1-y)*np.log(1-hx)) / m + l/2/m * np.sum(np.square(theta))
        grad = np.matmul(x_trans, (hx-y)) / m + l/m*theta
        return (J, grad)
    return cost
def gradDescent(X, Y, gen_cost=gen_cost, iterations=1000, a=0.1, tol=1e-3, record_cost=False):
    """train a parameter vector theta with batch gradient descent
    :returns: the trained theta (plus the recorded costs when record_cost is True)
    """
    cost = gen_cost(X, Y, h)
    rows, cols = X.shape
    theta = np.zeros(cols)
    # now train
    costs = []
    for i in range(iterations):
        cur_cost, grad = cost(theta)
        print(f"current_cost: {cur_cost}")
        if record_cost:
            costs.append([i, cur_cost])
        if cur_cost < tol:
            print(f'ends with cost: {cur_cost}')
            break
        theta = theta - a * grad
    if record_cost:
        return (theta, np.array(costs))
    return theta
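# Optional sanity check (not in the original script): compare the analytic gradient
# from gen_cost against a central finite-difference approximation on a tiny random problem.
_rng = np.random.RandomState(0)
_check_cost = gen_cost(_rng.randn(20, 3), _rng.randint(0, 2, 20), h)
_theta0 = _rng.randn(3)
_num_grad = np.array([(_check_cost(_theta0 + e)[0] - _check_cost(_theta0 - e)[0]) / 2e-5
                      for e in 1e-5 * np.eye(3)])
assert np.allclose(_num_grad, _check_cost(_theta0)[1], atol=1e-6)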
# train logistic regression for iris
iris = load_iris()
X = iris.data[iris.target != 2]  # drop the 3rd class to keep the problem binary
Y = iris.target[iris.target != 2]
# split into training and test sets
ids = np.random.permutation(X.shape[0])
train_ids, test_ids = ids[:70], ids[70:]
X_train, Y_train = X[train_ids, :], Y[train_ids]
X_test, Y_test = X[test_ids, :], Y[test_ids]
# train
rows, cols = X_train.shape
# bias = np.ones(rows)
# X_train = np.insert(X_train, 0, bias, axis=1)
theta = gradDescent(X_train, Y_train, iterations=100)
predict = gen_predict(theta)
predicted = predict(X_test)
print(f"accuracy: {np.count_nonzero(predicted==Y_test)/len(Y_test)*100}%")
# plot 2D
dims = (1, 2)
X0, X1 = X[:, dims[0]], X[:, dims[1]]
xx, yy = make_meshgrid(X0, X1)
# plot the data; only the two components of theta that match the plotted dimensions
# are used, so this is a rough 2D projection of the 4-feature model
predict = gen_predict(theta[list(dims)])
plot_contours(predict, xx, yy, cmap=plt.cm.Blues, alpha=0.8)
plt.scatter(X0, X1, c=np.where(Y==0, 'red', 'blue'), alpha=0.8)
plt.show()
#==============================================================================
# plot decision boundary of Logistic Regression in 3D
# train on only 2 features
X = iris.data[iris.target != 2][:, [1, 2]]
Y = iris.target[iris.target != 2]
X_0, X_1 = X[Y==0], X[Y==1]
theta = gradDescent(X, Y, iterations=100)
predict = lambda X: h(X, theta)
xx, yy = make_meshgrid(X[:, 0], X[:, 1], h=0.01)
Z = predict(np.c_[xx.ravel(), yy.ravel()]).reshape(xx.shape)
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')
ax.scatter(X_0[:, 0], X_0[:, 1], c='r')
ax.scatter(X_1[:, 0], X_1[:, 1], zs=1, c='g')
ax.plot_wireframe(xx, yy, Z, alpha=0.8)
ax.contourf(xx, yy, Z, 1, zdir='z', offset=0)
plt.show()
#==============================================================================
X = data_circular[:,[0,1]]
Y = data_circular[:,2]
plt.scatter(X[:,0], X[:,1], c=np.where(Y==0, 'red', 'blue'), alpha=0.8)
plt.show()
# add non-linear features: all monomials x0^i * x1^j with total degree 1..6
# (the constant term is not included)
def add_non_linear_features(X0, X1):
    ret = []
    for rank in range(1, 7):
        for rank_x0 in range(rank+1):
            ret.append((X0**rank_x0) * (X1**(rank-rank_x0)))
    return np.column_stack(ret)
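# Side note (an aside, not in the original): sklearn's PolynomialFeatures builds the
# same set of monomials up to degree 6 (minus the constant, with include_bias=False),
# just in a different column order, so it could replace the hand-written mapping.
from sklearn.preprocessing import PolynomialFeatures
_X_poly = PolynomialFeatures(degree=6, include_bias=False).fit_transform(X)
assert _X_poly.shape == add_non_linear_features(X[:, 0], X[:, 1]).shape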
X_mapping = add_non_linear_features(X[:,0], X[:,1])
# use fmin_bfgs to train; gradDescent is too slow for this
cost = gen_cost(X_mapping, Y, h)
cost_for_bfgs = lambda theta: cost(theta)[0]
gradient_for_bfgs = lambda theta: cost(theta)[1]
initial_theta = np.zeros(X_mapping.shape[1])
theta = fmin_bfgs(cost_for_bfgs, initial_theta, fprime=gradient_for_bfgs, gtol=1e-6)
# theta = gradDescent(X_mapping, Y, iterations=1000000, a=10)
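# The same fit through scipy's newer interface (a sketch, not in the original;
# fmin_bfgs is the legacy wrapper around the same BFGS routine):
from scipy.optimize import minimize
res = minimize(cost_for_bfgs, initial_theta, jac=gradient_for_bfgs,
               method='BFGS', options={'gtol': 1e-6})
print(f"minimize(BFGS) final cost: {res.fun}, fmin_bfgs final cost: {cost_for_bfgs(theta)}")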
# plot decision boundary in 2D
predict = gen_predict(theta)
xx, yy = make_meshgrid(X[:,0], X[:,1], h=.01)
Z = predict(add_non_linear_features(xx.ravel(), yy.ravel()))
Z = Z.reshape(xx.shape)
plt.contourf(xx, yy, Z, alpha=0.8)
plt.scatter(X[:,0], X[:,1], c=np.where(Y==0, 'red', 'blue'), alpha=0.8)
plt.show()
#==============================================================================
# show that the squared-error cost is not convex for logistic regression,
# so plain gradient descent can get stuck in a local minimum
samples = [(-5, 1), (-20, 0), (-2, 1)]
def cost(theta):
    def sigmoid(theta, x):
        return 1/(1 + math.e**(- theta*x))
    diffs = [(sigmoid(theta, x) - y) for x, y in samples]
    return sum(diff * diff for diff in diffs)/len(samples)/2
X = np.arange(-1, 1, 0.01)
Y = np.array([cost(theta) for theta in X])
plt.plot(X, Y)
plt.show()
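# For contrast (not in the original figure): on the same samples the cross-entropy
# cost is convex in theta, so gradient descent cannot get trapped in a local minimum.
def cost_cross_entropy(theta):
    def sig(x):
        return 1/(1 + math.e**(- theta*x))
    return sum(- y*math.log(sig(x)) - (1-y)*math.log(1 - sig(x))
               for x, y in samples)/len(samples)
plt.plot(X, Y, label='squared error')
plt.plot(X, [cost_cross_entropy(t) for t in X], label='cross-entropy')
plt.legend()
plt.show()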
#==============================================================================
# visualize how the value of the cost function relates to the decision boundary
X = iris.data[iris.target != 2][:, [1, 2]]
Y = iris.target[iris.target != 2]
X_0, X_1 = X[Y==0], X[Y==1]
cost = gen_cost(X, Y, h)
theta_good = gradDescent(X, Y, iterations=100)
predict_good = lambda X: h(X, theta_good)
theta_bad = gradDescent(X, Y, iterations=25)
predict_bad = lambda X: h(X, theta_bad)
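# The labels hard-coded into the 2D plot below ('cost = 0.08', 'cost = 0.25') can be
# checked against the actual values of the cost function:
print(f"cost(theta_good) = {cost(theta_good)[0]}, cost(theta_bad) = {cost(theta_bad)[0]}")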
xx, yy = make_meshgrid(X[:, 0], X[:, 1], h=0.01)
Z_good = predict_good(np.c_[xx.ravel(), yy.ravel()]).reshape(xx.shape)
Z_bad = predict_bad(np.c_[xx.ravel(), yy.ravel()]).reshape(xx.shape)
# plot 3D
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')
ax.scatter(X_0[:, 0], X_0[:, 1], c='r')
ax.scatter(X_1[:, 0], X_1[:, 1], zs=1, c='g')
ax.plot_wireframe(xx, yy, Z_good, alpha=0.8)
ax.plot_wireframe(xx, yy, Z_bad, alpha=0.8, color='green')
plt.show()
# plot 2D
predict_good = gen_predict(theta_good)
predict_bad = gen_predict(theta_bad)
fig = plt.figure()
# ax1 = fig.add_subplot(121)
CS = plot_contours(predict_good, xx, yy, alpha=0.5)
plt.clabel(CS, CS.levels[:2], fmt={CS.levels[1]: 'cost = 0.08'}, inline=True, fontsize=10)
CS = plot_contours(predict_bad, xx, yy, alpha=0.5)
plt.clabel(CS, CS.levels[:2], fmt={CS.levels[1]: 'cost = 0.25'}, inline=True, fontsize=10)
plt.scatter(X_0[:, 0], X_0[:, 1], c='r', alpha=0.8)
plt.scatter(X_1[:, 0], X_1[:, 1], c='g', alpha=0.8)
plt.savefig('cost-function-2D.svg')
plt.show()
#==============================================================================
# Regularization experiment
X = data_circular[:,[0,1]]
Y = data_circular[:,2]
# add non-linear features (same degree-6 mapping as above)
def add_non_linear_features(X0, X1):
    ret = []
    for rank in range(1, 7):
        for rank_x0 in range(rank+1):
            ret.append((X0**rank_x0) * (X1**(rank-rank_x0)))
    return np.column_stack(ret)
X_mapping = add_non_linear_features(X[:,0], X[:,1])
# use fmin_bfgs to train; gradDescent is too slow for this
def train(gen_cost, **params):
    initial_theta = np.zeros(X_mapping.shape[1])
    cost = gen_cost(X_mapping, Y, h, **params)
    cost_for_bfgs = lambda theta: cost(theta)[0]
    gradient_for_bfgs = lambda theta: cost(theta)[1]
    return fmin_bfgs(cost_for_bfgs, initial_theta, fprime=gradient_for_bfgs)
    # return gradDescent(X_mapping, Y, gen_cost_regularized, **params)
theta_not_regularized = train(gen_cost)
theta_regularized = train(gen_cost_regularized, l=0.005)
# plot decision boundary in 2D for several values of lambda
def plot_circular(theta, plt=plt):
    predict = gen_predict(theta)
    xx, yy = make_meshgrid(X[:,0], X[:,1], h=.01)
    Z = predict(add_non_linear_features(xx.ravel(), yy.ravel()))
    Z = Z.reshape(xx.shape)
    plt.scatter(X[:,0], X[:,1], c=np.where(Y==0, 'red', 'blue'), alpha=0.8)
    plt.contourf(xx, yy, Z, alpha=0.6)

fig = plt.figure()
ax1 = fig.add_subplot(221)
l0 = train(gen_cost_regularized, l=0)
plot_circular(l0, plt=ax1)
ax1.set_title(r'$\lambda = 0$')
ax2 = fig.add_subplot(222)
l1 = train(gen_cost_regularized, l=0.001)
plot_circular(l1, plt=ax2)
ax2.set_title(r'$\lambda = 0.001$')
ax3 = fig.add_subplot(223)
l2 = train(gen_cost_regularized, l=0.01)
plot_circular(l2, plt=ax3)
ax3.set_title(r'$\lambda = 0.01$')
ax4 = fig.add_subplot(224)
l3 = train(gen_cost_regularized, l=0.1)
plot_circular(l3, plt=ax4)
ax4.set_title(r'$\lambda = 0.1$')
plt.show()
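# For reference (an aside, not in the original): sklearn's LogisticRegression exposes
# L2 regularization through C, an *inverse* regularization strength, so a smaller C
# plays the role of a larger lambda here (the exact correspondence depends on how the
# loss is scaled and on the intercept handling).
from sklearn.linear_model import LogisticRegression
sk_reg = LogisticRegression(C=1.0, max_iter=1000).fit(X_mapping, Y)
print(f"sklearn (C=1.0) training accuracy: {sk_reg.score(X_mapping, Y)}")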
#==============================================================================
# Scaling: compare how fast gradient descent drives the cost down with and
# without feature standardization
import sklearn.preprocessing as preprocessing
scaler = preprocessing.StandardScaler()
X_mapping_scaled = scaler.fit_transform(X_mapping)
theta_normal, costs_normal = gradDescent(X_mapping, Y, gen_cost, record_cost=True)
theta_scaled, costs_scaled = gradDescent(X_mapping_scaled, Y, gen_cost, record_cost=True)
not_scaled, = plt.plot(costs_normal[:,0], costs_normal[:,1], label='Not Scaled')
scaled, = plt.plot(costs_scaled[:,0], costs_scaled[:,1], label='Scaled')
plt.legend([not_scaled, scaled], ['Not scaled', 'Scaled'])
plt.xlabel('Number of Iterations')
plt.ylabel(r'$J(\theta)$')
plt.savefig('learning-rate-scaled.svg')
plt.show()
#==============================================================================
# With or without Bias
X = iris.data[iris.target != 2][:, [2, 3]]
Y = iris.target[iris.target != 2]
X_0, X_1 = X[Y==0], X[Y==1]
def add_bias(X):
    bias = np.ones(X.shape[0])
    return np.insert(X, 0, bias, axis=1)
theta_no_bias = gradDescent(X, Y, iterations=1000)
predict_no_bias = lambda X: np.where(h(X, theta_no_bias)>=0.5, 1, 0)
X_with_bias = add_bias(X)
theta_with_bias = gradDescent(X_with_bias, Y, iterations=1000)
predict_with_bias = lambda X: np.where(h(add_bias(X), theta_with_bias)>=0.5, 1, 0)
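# Why the comparison works: without a bias term the decision boundary is
# theta_1*x_1 + theta_2*x_2 = 0, a line forced through the origin, whereas with a
# bias the boundary theta_0 + theta_1*x_1 + theta_2*x_2 = 0 can shift freely.
# The dashed axes drawn below make the origin easy to spot.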
x_min, x_max = -1, X[:,0].max() + 1  # start at x = -1 so the region around the origin is visible
y_min, y_max = X[:,1].min() - 1, X[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, 0.01), np.arange(y_min, y_max, 0.01))
fig = plt.figure()
CS = plot_contours(predict_no_bias, xx, yy, alpha=0.5)
plt.clabel(CS, CS.levels[:2], fmt={CS.levels[1]: 'No Bias'}, inline=True, fontsize=16)
CS = plot_contours(predict_with_bias, xx, yy, alpha=0.5)
plt.clabel(CS, CS.levels[:2], fmt={CS.levels[1]: 'Bias'}, inline=True, fontsize=16)
plt.scatter(X_0[:, 0], X_0[:, 1], c='r', alpha=0.8)
plt.scatter(X_1[:, 0], X_1[:, 1], c='g', alpha=0.8)
# dashed lines mark the axes through the origin
plt.plot(np.linspace(-1,0), np.zeros(len(np.linspace(-1,0))), 'c--')
plt.plot(np.zeros(len(np.linspace(y_min,0))), np.linspace(y_min,0), 'c--')
plt.show()