Experiment on logistic regression
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
from sklearn.datasets import load_iris
import numpy as np
import math
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from scipy.optimize import fmin_bfgs
#==============================================================================
# Circular Data
data_circular = np.array([[0.051267,0.69956,1],
[-0.092742,0.68494,1],
[-0.21371,0.69225,1],
[-0.375,0.50219,1],
[-0.51325,0.46564,1],
[-0.52477,0.2098,1],
[-0.39804,0.034357,1],
[-0.30588,-0.19225,1],
[0.016705,-0.40424,1],
[0.13191,-0.51389,1],
[0.38537,-0.56506,1],
[0.52938,-0.5212,1],
[0.63882,-0.24342,1],
[0.73675,-0.18494,1],
[0.54666,0.48757,1],
[0.322,0.5826,1],
[0.16647,0.53874,1],
[-0.046659,0.81652,1],
[-0.17339,0.69956,1],
[-0.47869,0.63377,1],
[-0.60541,0.59722,1],
[-0.62846,0.33406,1],
[-0.59389,0.005117,1],
[-0.42108,-0.27266,1],
[-0.11578,-0.39693,1],
[0.20104,-0.60161,1],
[0.46601,-0.53582,1],
[0.67339,-0.53582,1],
[-0.13882,0.54605,1],
[-0.29435,0.77997,1],
[-0.26555,0.96272,1],
[-0.16187,0.8019,1],
[-0.17339,0.64839,1],
[-0.28283,0.47295,1],
[-0.36348,0.31213,1],
[-0.30012,0.027047,1],
[-0.23675,-0.21418,1],
[-0.06394,-0.18494,1],
[0.062788,-0.16301,1],
[0.22984,-0.41155,1],
[0.2932,-0.2288,1],
[0.48329,-0.18494,1],
[0.64459,-0.14108,1],
[0.46025,0.012427,1],
[0.6273,0.15863,1],
[0.57546,0.26827,1],
[0.72523,0.44371,1],
[0.22408,0.52412,1],
[0.44297,0.67032,1],
[0.322,0.69225,1],
[0.13767,0.57529,1],
[-0.0063364,0.39985,1],
[-0.092742,0.55336,1],
[-0.20795,0.35599,1],
[-0.20795,0.17325,1],
[-0.43836,0.21711,1],
[-0.21947,-0.016813,1],
[-0.13882,-0.27266,1],
[0.18376,0.93348,0],
[0.22408,0.77997,0],
[0.29896,0.61915,0],
[0.50634,0.75804,0],
[0.61578,0.7288,0],
[0.60426,0.59722,0],
[0.76555,0.50219,0],
[0.92684,0.3633,0],
[0.82316,0.27558,0],
[0.96141,0.085526,0],
[0.93836,0.012427,0],
[0.86348,-0.082602,0],
[0.89804,-0.20687,0],
[0.85196,-0.36769,0],
[0.82892,-0.5212,0],
[0.79435,-0.55775,0],
[0.59274,-0.7405,0],
[0.51786,-0.5943,0],
[0.46601,-0.41886,0],
[0.35081,-0.57968,0],
[0.28744,-0.76974,0],
[0.085829,-0.75512,0],
[0.14919,-0.57968,0],
[-0.13306,-0.4481,0],
[-0.40956,-0.41155,0],
[-0.39228,-0.25804,0],
[-0.74366,-0.25804,0],
[-0.69758,0.041667,0],
[-0.75518,0.2902,0],
[-0.69758,0.68494,0],
[-0.4038,0.70687,0],
[-0.38076,0.91886,0],
[-0.50749,0.90424,0],
[-0.54781,0.70687,0],
[0.10311,0.77997,0],
[0.057028,0.91886,0],
[-0.10426,0.99196,0],
[-0.081221,1.1089,0],
[0.28744,1.087,0],
[0.39689,0.82383,0],
[0.63882,0.88962,0],
[0.82316,0.66301,0],
[0.67339,0.64108,0],
[1.0709,0.10015,0],
[-0.046659,-0.57968,0],
[-0.23675,-0.63816,0],
[-0.15035,-0.36769,0],
[-0.49021,-0.3019,0],
[-0.46717,-0.13377,0],
[-0.28859,-0.060673,0],
[-0.61118,-0.067982,0],
[-0.66302,-0.21418,0],
[-0.59965,-0.41886,0],
[-0.72638,-0.082602,0],
[-0.83007,0.31213,0],
[-0.72062,0.53874,0],
[-0.59389,0.49488,0],
[-0.48445,0.99927,0],
[-0.0063364,0.99927,0],
[0.63265,-0.030612,0]])
#==============================================================================
# helpers to plot decision boundary
# http://scikit-learn.org/stable/auto_examples/svm/plot_iris.html#sphx-glr-auto-examples-svm-plot-iris-py
def make_meshgrid(x, y, h=.02):
    x_min, x_max = x.min() - 1, x.max() + 1
    y_min, y_max = y.min() - 1, y.max() + 1
    return np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))

def plot_contours(predict, xx, yy, **params):
    Z = predict(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)
    # pop 'ax' so it is not forwarded to contourf as an unknown keyword argument
    ax = params.pop('ax', plt)
    return ax.contourf(xx, yy, Z, **params)
#==============================================================================
# Implementation of Logistic Regression
def sigmoid(x):
    return 1/(1+np.exp(-x))

def h(X, theta):
    """the hypothesis function
    :X: a matrix, each row is a sample with columns of features
    :theta: an array representing the parameters of the regression, array([theta_0, theta_1, ...])
    :returns: array([h_0, h_1, ...]), the hypothesis for each sample
    """
    product = np.matmul(X, theta)
    return sigmoid(product)
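# Optional sanity check (not in the original script): scipy's expit is a numerically
# stable implementation of the same logistic function; for moderate inputs it agrees
# with the hand-written sigmoid above.
from scipy.special import expit
assert np.allclose(sigmoid(np.linspace(-10, 10, 5)), expit(np.linspace(-10, 10, 5)))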
def gen_predict(theta):
    def predict(X):
        # h(x) >= 0.5 is equivalent to theta . x >= 0, i.e. the positive side of the boundary
        return np.where(h(X, theta) >= 0.5, 1, 0)
    return predict
def gen_cost(data, labels, h, **params):
    """
    :data: a matrix array([[col, col], ...])
    :labels: an array that contains the label of each sample
    :returns: a cost function that takes theta and returns (current_cost, array([gradient]))
    """
    m = len(data)  # number of samples
    x_trans = data.transpose()
    y = labels
    def cost(theta):
        """:returns: (current_cost, gradient)"""
        hx = h(data, theta)
        J = np.sum(- y*np.log(hx) - (1-y)*np.log(1-hx)) / m
        grad = np.matmul(x_trans, (hx-y)) / m
        return (J, grad)
    return cost

def gen_cost_regularized(data, labels, h, l=1, **params):
    m = len(data)  # number of samples
    x_trans = data.transpose()
    y = labels
    def cost(theta):
        """:returns: (current_cost, gradient)"""
        hx = h(data, theta)
        # the L2 penalty is applied to every component of theta
        # (the feature mappings used below contain no separate bias column)
        J = np.sum(- y*np.log(hx) - (1-y)*np.log(1-hx)) / m + l/2/m * np.sum(np.square(theta))
        grad = np.matmul(x_trans, (hx-y)) / m + l/m*theta
        return (J, grad)
    return cost
def gradDescent(X, Y, gen_cost=gen_cost, iterations=1000, a=0.1, tol=1e-3, record_cost=False):
    """train a parameter vector theta with batch gradient descent
    :returns: the trained theta (plus the recorded costs when record_cost is True)
    """
    cost = gen_cost(X, Y, h)
    rows, cols = X.shape
    theta = np.zeros(cols)
    # now train
    costs = []
    for i in range(iterations):
        cur_cost, grad = cost(theta)
        print(f"current_cost: {cur_cost}")
        if record_cost:
            costs.append([i, cur_cost])
        if cur_cost < tol:
            print(f'ends with cost: {cur_cost}')
            break
        theta = theta - a * grad
    if record_cost:
        return (theta, np.array(costs))
    return theta
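# Optional sanity check (not in the original script): compare the analytic gradient
# from gen_cost against a central finite-difference approximation on a tiny random problem.
_rng = np.random.RandomState(0)
_check_cost = gen_cost(_rng.randn(20, 3), _rng.randint(0, 2, 20), h)
_theta0 = _rng.randn(3)
_num_grad = np.array([(_check_cost(_theta0 + e)[0] - _check_cost(_theta0 - e)[0]) / 2e-5
                      for e in 1e-5 * np.eye(3)])
assert np.allclose(_num_grad, _check_cost(_theta0)[1], atol=1e-6)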
# train logistic regression for iris
iris = load_iris()
X = iris.data[iris.target != 2]  # drop the 3rd class to keep the problem binary
Y = iris.target[iris.target != 2]
# split into training and test sets
ids = np.random.permutation(X.shape[0])
train_ids, test_ids = ids[:70], ids[70:]
X_train, Y_train = X[train_ids, :], Y[train_ids]
X_test, Y_test = X[test_ids, :], Y[test_ids]
# train
rows, cols = X_train.shape
# bias = np.ones(rows)
# X_train = np.insert(X_train, 0, bias, axis=1)
theta = gradDescent(X_train, Y_train, iterations=100)
predict = gen_predict(theta)
predicted = predict(X_test)
print(f"accuracy: {np.count_nonzero(predicted==Y_test)/len(Y_test)*100}%")
# plot 2D
dims = (1, 2)
X0, X1 = X[:, dims[0]], X[:, dims[1]]
xx, yy = make_meshgrid(X0, X1)
# plot the data; only the two components of theta that match the plotted dimensions
# are used, so this is a rough 2D projection of the 4-feature model
predict = gen_predict(theta[list(dims)])
plot_contours(predict, xx, yy, cmap=plt.cm.Blues, alpha=0.8)
plt.scatter(X0, X1, c=np.where(Y==0, 'red', 'blue'), alpha=0.8)
plt.show()
#==============================================================================
# plot decision boundary of Logistic Regression in 3D
# train on only 2 features
X = iris.data[iris.target != 2][:, [1, 2]]
Y = iris.target[iris.target != 2]
X_0, X_1 = X[Y==0], X[Y==1]
theta = gradDescent(X, Y, iterations=100)
predict = lambda X: h(X, theta)
xx, yy = make_meshgrid(X[:, 0], X[:, 1], h=0.01)
Z = predict(np.c_[xx.ravel(), yy.ravel()]).reshape(xx.shape)
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')
ax.scatter(X_0[:, 0], X_0[:, 1], c='r')
ax.scatter(X_1[:, 0], X_1[:, 1], zs=1, c='g')
ax.plot_wireframe(xx, yy, Z, alpha=0.8)
ax.contourf(xx, yy, Z, 1, zdir='z', offset=0)
plt.show()
#==============================================================================
X = data_circular[:,[0,1]]
Y = data_circular[:,2]
plt.scatter(X[:,0], X[:,1], c=np.where(Y==0, 'red', 'blue'), alpha=0.8)
plt.show()
# add non-linear features: all monomials x0^i * x1^j with total degree 1..6
# (the constant term is not included)
def add_non_linear_features(X0, X1):
    ret = []
    for rank in range(1, 7):
        for rank_x0 in range(rank+1):
            ret.append((X0**rank_x0) * (X1**(rank-rank_x0)))
    return np.column_stack(ret)
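# Side note (an aside, not in the original): sklearn's PolynomialFeatures builds the
# same set of monomials up to degree 6 (minus the constant, with include_bias=False),
# just in a different column order, so it could replace the hand-written mapping.
from sklearn.preprocessing import PolynomialFeatures
_X_poly = PolynomialFeatures(degree=6, include_bias=False).fit_transform(X)
assert _X_poly.shape == add_non_linear_features(X[:, 0], X[:, 1]).shape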
X_mapping = add_non_linear_features(X[:,0], X[:,1])
# use fmin_bfgs to train; gradDescent is too slow for this
cost = gen_cost(X_mapping, Y, h)
cost_for_bfgs = lambda theta: cost(theta)[0]
gradient_for_bfgs = lambda theta: cost(theta)[1]
initial_theta = np.zeros(X_mapping.shape[1])
theta = fmin_bfgs(cost_for_bfgs, initial_theta, fprime=gradient_for_bfgs, gtol=1e-6)
# theta = gradDescent(X_mapping, Y, iterations=1000000, a=10)
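# The same fit through scipy's newer interface (a sketch, not in the original;
# fmin_bfgs is the legacy wrapper around the same BFGS routine):
from scipy.optimize import minimize
res = minimize(cost_for_bfgs, initial_theta, jac=gradient_for_bfgs,
               method='BFGS', options={'gtol': 1e-6})
print(f"minimize(BFGS) final cost: {res.fun}, fmin_bfgs final cost: {cost_for_bfgs(theta)}")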
# plot decision boundary in 2D
predict = gen_predict(theta)
xx, yy = make_meshgrid(X[:,0], X[:,1], h=.01)
Z = predict(add_non_linear_features(xx.ravel(), yy.ravel()))
Z = Z.reshape(xx.shape)
plt.contourf(xx, yy, Z, alpha=0.8)
plt.scatter(X[:,0], X[:,1], c=np.where(Y==0, 'red', 'blue'), alpha=0.8)
plt.show()
#==============================================================================
# show that the squared-error cost is not convex for logistic regression,
# so plain gradient descent can get stuck in a local minimum
samples = [(-5, 1), (-20, 0), (-2, 1)]
def cost(theta):
    def sigmoid(theta, x):
        return 1/(1 + math.e**(- theta*x))
    diffs = [(sigmoid(theta, x) - y) for x, y in samples]
    return sum(diff * diff for diff in diffs)/len(samples)/2
X = np.arange(-1, 1, 0.01)
Y = np.array([cost(theta) for theta in X])
plt.plot(X, Y)
plt.show()
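# For contrast (not in the original figure): on the same samples the cross-entropy
# cost is convex in theta, so gradient descent cannot get trapped in a local minimum.
def cost_cross_entropy(theta):
    def sig(x):
        return 1/(1 + math.e**(- theta*x))
    return sum(- y*math.log(sig(x)) - (1-y)*math.log(1 - sig(x))
               for x, y in samples)/len(samples)
plt.plot(X, Y, label='squared error')
plt.plot(X, [cost_cross_entropy(t) for t in X], label='cross-entropy')
plt.legend()
plt.show()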
#==============================================================================
# visualize how the value of the cost function relates to the decision boundary
X = iris.data[iris.target != 2][:, [1, 2]]
Y = iris.target[iris.target != 2]
X_0, X_1 = X[Y==0], X[Y==1]
cost = gen_cost(X, Y, h)
theta_good = gradDescent(X, Y, iterations=100)
predict_good = lambda X: h(X, theta_good)
theta_bad = gradDescent(X, Y, iterations=25)
predict_bad = lambda X: h(X, theta_bad)
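# The labels hard-coded into the 2D plot below ('cost = 0.08', 'cost = 0.25') can be
# checked against the actual values of the cost function:
print(f"cost(theta_good) = {cost(theta_good)[0]}, cost(theta_bad) = {cost(theta_bad)[0]}")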
xx, yy = make_meshgrid(X[:, 0], X[:, 1], h=0.01)
Z_good = predict_good(np.c_[xx.ravel(), yy.ravel()]).reshape(xx.shape)
Z_bad = predict_bad(np.c_[xx.ravel(), yy.ravel()]).reshape(xx.shape)
# plot 3D
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')
ax.scatter(X_0[:, 0], X_0[:, 1], c='r')
ax.scatter(X_1[:, 0], X_1[:, 1], zs=1, c='g')
ax.plot_wireframe(xx, yy, Z_good, alpha=0.8)
ax.plot_wireframe(xx, yy, Z_bad, alpha=0.8, color='green')
plt.show()
# plot 2D
predict_good = gen_predict(theta_good)
predict_bad = gen_predict(theta_bad)
fig = plt.figure()
# ax1 = fig.add_subplot(121)
CS = plot_contours(predict_good, xx, yy, alpha=0.5)
plt.clabel(CS, CS.levels[:2], fmt={CS.levels[1]: 'cost = 0.08'}, inline=True, fontsize=10)
CS = plot_contours(predict_bad, xx, yy, alpha=0.5)
plt.clabel(CS, CS.levels[:2], fmt={CS.levels[1]: 'cost = 0.25'}, inline=True, fontsize=10)
plt.scatter(X_0[:, 0], X_0[:, 1], c='r', alpha=0.8)
plt.scatter(X_1[:, 0], X_1[:, 1], c='g', alpha=0.8)
plt.savefig('cost-function-2D.svg')
plt.show()
#==============================================================================
# Regularization experiment
X = data_circular[:,[0,1]]
Y = data_circular[:,2]
# add non-linear features (same degree-6 mapping as above)
def add_non_linear_features(X0, X1):
    ret = []
    for rank in range(1, 7):
        for rank_x0 in range(rank+1):
            ret.append((X0**rank_x0) * (X1**(rank-rank_x0)))
    return np.column_stack(ret)
X_mapping = add_non_linear_features(X[:,0], X[:,1])
# use fmin_bfgs to train; gradDescent is too slow for this
def train(gen_cost, **params):
    initial_theta = np.zeros(X_mapping.shape[1])
    cost = gen_cost(X_mapping, Y, h, **params)
    cost_for_bfgs = lambda theta: cost(theta)[0]
    gradient_for_bfgs = lambda theta: cost(theta)[1]
    return fmin_bfgs(cost_for_bfgs, initial_theta, fprime=gradient_for_bfgs)
    # return gradDescent(X_mapping, Y, gen_cost_regularized, **params)
theta_not_regularized = train(gen_cost)
theta_regularized = train(gen_cost_regularized, l=0.005)
# plot decision boundary in 2D for several values of lambda
def plot_circular(theta, plt=plt):
    predict = gen_predict(theta)
    xx, yy = make_meshgrid(X[:,0], X[:,1], h=.01)
    Z = predict(add_non_linear_features(xx.ravel(), yy.ravel()))
    Z = Z.reshape(xx.shape)
    plt.scatter(X[:,0], X[:,1], c=np.where(Y==0, 'red', 'blue'), alpha=0.8)
    plt.contourf(xx, yy, Z, alpha=0.6)

fig = plt.figure()
ax1 = fig.add_subplot(221)
l0 = train(gen_cost_regularized, l=0)
plot_circular(l0, plt=ax1)
ax1.set_title(r'$\lambda = 0$')
ax2 = fig.add_subplot(222)
l1 = train(gen_cost_regularized, l=0.001)
plot_circular(l1, plt=ax2)
ax2.set_title(r'$\lambda = 0.001$')
ax3 = fig.add_subplot(223)
l2 = train(gen_cost_regularized, l=0.01)
plot_circular(l2, plt=ax3)
ax3.set_title(r'$\lambda = 0.01$')
ax4 = fig.add_subplot(224)
l3 = train(gen_cost_regularized, l=0.1)
plot_circular(l3, plt=ax4)
ax4.set_title(r'$\lambda = 0.1$')
plt.show()
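# For reference (an aside, not in the original): sklearn's LogisticRegression exposes
# L2 regularization through C, an *inverse* regularization strength, so a smaller C
# plays the role of a larger lambda here (the exact correspondence depends on how the
# loss is scaled and on the intercept handling).
from sklearn.linear_model import LogisticRegression
sk_reg = LogisticRegression(C=1.0, max_iter=1000).fit(X_mapping, Y)
print(f"sklearn (C=1.0) training accuracy: {sk_reg.score(X_mapping, Y)}")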
#==============================================================================
# Scaling: compare how fast gradient descent drives the cost down with and
# without feature standardization
import sklearn.preprocessing as preprocessing
scaler = preprocessing.StandardScaler()
X_mapping_scaled = scaler.fit_transform(X_mapping)
theta_normal, costs_normal = gradDescent(X_mapping, Y, gen_cost, record_cost=True)
theta_scaled, costs_scaled = gradDescent(X_mapping_scaled, Y, gen_cost, record_cost=True)
not_scaled, = plt.plot(costs_normal[:,0], costs_normal[:,1], label='Not Scaled')
scaled, = plt.plot(costs_scaled[:,0], costs_scaled[:,1], label='Scaled')
plt.legend([not_scaled, scaled], ['Not scaled', 'Scaled'])
plt.xlabel('Number of Iterations')
plt.ylabel(r'$J(\theta)$')
plt.savefig('learning-rate-scaled.svg')
plt.show()
#==============================================================================
# With or without Bias
X = iris.data[iris.target != 2][:, [2, 3]]
Y = iris.target[iris.target != 2]
X_0, X_1 = X[Y==0], X[Y==1]
def add_bias(X):
    bias = np.ones(X.shape[0])
    return np.insert(X, 0, bias, axis=1)
theta_no_bias = gradDescent(X, Y, iterations=1000)
predict_no_bias = lambda X: np.where(h(X, theta_no_bias)>=0.5, 1, 0)
X_with_bias = add_bias(X)
theta_with_bias = gradDescent(X_with_bias, Y, iterations=1000)
predict_with_bias = lambda X: np.where(h(add_bias(X), theta_with_bias)>=0.5, 1, 0)
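# Why the comparison works: without a bias term the decision boundary is
# theta_1*x_1 + theta_2*x_2 = 0, a line forced through the origin, whereas with a
# bias the boundary theta_0 + theta_1*x_1 + theta_2*x_2 = 0 can shift freely.
# The dashed axes drawn below make the origin easy to spot.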
x_min, x_max = -1, X[:,0].max() + 1  # start at x = -1 so the region around the origin is visible
y_min, y_max = X[:,1].min() - 1, X[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, 0.01), np.arange(y_min, y_max, 0.01))
fig = plt.figure()
CS = plot_contours(predict_no_bias, xx, yy, alpha=0.5)
plt.clabel(CS, CS.levels[:2], fmt={CS.levels[1]: 'No Bias'}, inline=True, fontsize=16)
CS = plot_contours(predict_with_bias, xx, yy, alpha=0.5)
plt.clabel(CS, CS.levels[:2], fmt={CS.levels[1]: 'Bias'}, inline=True, fontsize=16)
plt.scatter(X_0[:, 0], X_0[:, 1], c='r', alpha=0.8)
plt.scatter(X_1[:, 0], X_1[:, 1], c='g', alpha=0.8)
# dashed lines mark the axes through the origin
plt.plot(np.linspace(-1,0), np.zeros(len(np.linspace(-1,0))), 'c--')
plt.plot(np.zeros(len(np.linspace(y_min,0))), np.linspace(y_min,0), 'c--')
plt.show()