threecourse/firstpattern4_4.py

## firstpattern4_4.py
# -*- coding: utf-8 -*-
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import datasets
import urllib

# ピマインディアンデータセット ---------------------
# http://archive.ics.uci.edu/ml/machine-learning-databases/pima-indians-diabetes/pima-indians-diabetes.names
# URL for the Pima Indians Diabetes dataset (UCI Machine Learning Repository)
url = "http://goo.gl/j0Rvxq"
# download the file
raw_data = urllib.urlopen(url)
# load the CSV file as a numpy matrix
dataset = np.loadtxt(raw_data, delimiter=",")
print "pima dataset shape", dataset.shape

df = pd.DataFrame(data = dataset[:,[1,5,8]], columns = ["glu", "bmi", "type"])
df = df[ (df["glu"] > 0) & (df["bmi"] > 0) ]

data = df[["glu", "bmi"]]
data0 = data[df["type"] == 0]
data1 = data[df["type"] == 1]

# 2次識別境界テスト用データセット ---------------------
def determine_class(x, y):
    factors = [3.0, 5.0, 3.0, -2.0, 0.5, -3.0]
    params = np.array([x * x, x * y, y * y ,x ,y, 1.0])
    val = (np.array(factors) * np.array(params)).sum()
    if val < 0.0 : return 0
    return 1

np.random.seed(1234)
xs = np.random.normal(size=1000)
ys = np.random.normal(size=1000)
types = [determine_class(x,y) for (x,y) in zip(xs, ys)]

df2 = pd.DataFrame(data = np.array([xs, ys, types]).T, columns = ["x", "y", "t"])
data2 = df2[["x", "y"]]
data2_0 = data2[df2["t"] == 0]
data2_1 = data2[df2["t"] == 1]

# 識別器 ---------------------

class base_classfier(object):
    u"識別器ベースクラス"
    def xT_m_x(s, x_ary, m):
        x = s.vv(x_ary) # vertical vector
        return np.dot(np.dot(x.T, m), x)

    def setdata(s, data0, data1):
        s.data0 = data0
        s.data1 = data1
        s.cov0 = np.cov(data0.T)
        s.cov1 = np.cov(data1.T)
        s.n0 = len(data0)
        s.n1 = len(data1)
        s.mu0 = data0.mean()
        s.mu1 = data1.mean()

    def vv(s, ary):
        u"縦ベクトルへの変換"
        return np.matrix(ary).T

class linear_classfier(base_classfier):
    u"線形識別境界識別器"

    def fit(s, data0, data1):
        s.setdata(data0, data1)

        s.covc = (s.cov0 * s.n0 + s.cov1 * s.n1) / (s.n0 + s.n1)
        s.covcinv = np.linalg.inv(s.covc)
        s.cT = np.dot(s.mu1 - s.mu0, s.covcinv)
        s.F = (   s.xT_m_x(s.mu0, s.covcinv)
                - s.xT_m_x(s.mu1, s.covcinv)
                - 2 * np.log(float(s.n0) / s.n1))

    def predict_row(s, series):
        return (2 * np.dot(s.cT, s.vv(series)) + s.F)[0, 0]

    def predict(s, data):
        p = data.apply(s.predict_row, axis = 1)
        p = np.where(p < 0.0, 0, 1)
        return p


class quadratic_classifier(base_classfier):
    u"2次識別境界識別器"

    def fit(s, data0, data1):
        s.setdata(data0, data1)

        s.cov0inv = np.linalg.inv(s.cov0)
        s.cov1inv = np.linalg.inv(s.cov1)
        s.S = s.cov0inv - s.cov1inv
        s.cT = np.dot(s.mu1, s.cov1inv) - np.dot(s.mu0, s.cov0inv)
        s.F = (   s.xT_m_x(s.mu0, s.cov0inv)
                - s.xT_m_x(s.mu1, s.cov1inv)
                + np.log( np.linalg.det(s.cov0) / np.linalg.det(s.cov1) )
                - 2 * np.log(float(s.n0) / s.n1)
               )

    def predict_row(s, series):
        return (s.xT_m_x(series, s.S) + 2.0 * np.dot(s.cT, s.vv(series)) + s.F)[0, 0]

    def predict(s, data):
        p = data.apply(s.predict_row, axis = 1)
        p = np.where(p < 0.0, 0, 1)
        return p

# ユーティリティ ---------------------
def customized_plot(df, col_x, col_y, predict):
    plt.scatter(df[col_x], df[col_y], c=predict, cmap=plt.cm.rainbow_r)
    plt.xlabel(col_x)
    plt.ylabel(col_y)
    plt.plot()
    #plt.show()

# ピマインディアンデータセットの分類 ---------------------
plt.clf()
fig = plt.figure(1)

lc = linear_classfier()
lc.fit(data0, data1)
p1 = lc.predict(data)

qc = quadratic_classifier()
qc.fit(data0, data1)
p2 = qc.predict(data)

plt.subplot(2, 2, 1)
plt.title('data')
customized_plot(df, "glu", "bmi", df["type"])

plt.subplot(2, 2, 3)
plt.title('linear classfier')
customized_plot(df, "glu", "bmi", p1)

plt.subplot(2, 2, 4)
plt.title('quadratic classifier')
customized_plot(df, "glu", "bmi", p2)

plt.show()
fig.savefig('pima_classify.png')

# 2次識別境界テスト用データセットの分類 ---------------------
plt.clf()
fig = plt.figure(1)

lc2 = linear_classfier()
lc2.fit(data2_0, data2_1)
p2_1 = lc2.predict(data2)

qc2 = quadratic_classifier()
qc2.fit(data2_0, data2_1)
p2_2 = qc2.predict(data2)

plt.subplot(2, 2, 1)
plt.title('data')
customized_plot(df2, "x", "y", df2["t"])

plt.subplot(2, 2, 3)
plt.title('linear classfier')
customized_plot(df2, "x", "y", p2_1)

plt.subplot(2, 2, 4)
plt.title('quadratic classifier')
customized_plot(df2, "x", "y", p2_2)

plt.show()
fig.savefig('testdataset_classify.png')


print "finished"
	# -- coding: utf-8 --
	import numpy as np
	import pandas as pd
	import matplotlib.pyplot as plt
	from sklearn import datasets
	import urllib

	# ピマインディアンデータセット ---------------------
	# http://archive.ics.uci.edu/ml/machine-learning-databases/pima-indians-diabetes/pima-indians-diabetes.names
	# URL for the Pima Indians Diabetes dataset (UCI Machine Learning Repository)
	url = "http://goo.gl/j0Rvxq"
	# download the file
	raw_data = urllib.urlopen(url)
	# load the CSV file as a numpy matrix
	dataset = np.loadtxt(raw_data, delimiter=",")
	print "pima dataset shape", dataset.shape

	df = pd.DataFrame(data = dataset[:,[1,5,8]], columns = ["glu", "bmi", "type"])
	df = df[ (df["glu"] > 0) & (df["bmi"] > 0) ]

	data = df[["glu", "bmi"]]
	data0 = data[df["type"] == 0]
	data1 = data[df["type"] == 1]

	# 2次識別境界テスト用データセット ---------------------
	def determine_class(x, y):
	factors = [3.0, 5.0, 3.0, -2.0, 0.5, -3.0]
	params = np.array([x * x, x * y, y * y ,x ,y, 1.0])
	val = (np.array(factors) * np.array(params)).sum()
	if val < 0.0 : return 0
	return 1

	np.random.seed(1234)
	xs = np.random.normal(size=1000)
	ys = np.random.normal(size=1000)
	types = [determine_class(x,y) for (x,y) in zip(xs, ys)]

	df2 = pd.DataFrame(data = np.array([xs, ys, types]).T, columns = ["x", "y", "t"])
	data2 = df2[["x", "y"]]
	data2_0 = data2[df2["t"] == 0]
	data2_1 = data2[df2["t"] == 1]

	# 識別器 ---------------------

	class base_classfier(object):
	u"識別器ベースクラス"
	def xT_m_x(s, x_ary, m):
	x = s.vv(x_ary) # vertical vector
	return np.dot(np.dot(x.T, m), x)

	def setdata(s, data0, data1):
	s.data0 = data0
	s.data1 = data1
	s.cov0 = np.cov(data0.T)
	s.cov1 = np.cov(data1.T)
	s.n0 = len(data0)
	s.n1 = len(data1)
	s.mu0 = data0.mean()
	s.mu1 = data1.mean()

	def vv(s, ary):
	u"縦ベクトルへの変換"
	return np.matrix(ary).T

	class linear_classfier(base_classfier):
	u"線形識別境界識別器"

	def fit(s, data0, data1):
	s.setdata(data0, data1)

	s.covc = (s.cov0 * s.n0 + s.cov1 * s.n1) / (s.n0 + s.n1)
	s.covcinv = np.linalg.inv(s.covc)
	s.cT = np.dot(s.mu1 - s.mu0, s.covcinv)
	s.F = ( s.xT_m_x(s.mu0, s.covcinv)
	- s.xT_m_x(s.mu1, s.covcinv)
	- 2 * np.log(float(s.n0) / s.n1))

	def predict_row(s, series):
	return (2 * np.dot(s.cT, s.vv(series)) + s.F)[0, 0]

	def predict(s, data):
	p = data.apply(s.predict_row, axis = 1)
	p = np.where(p < 0.0, 0, 1)
	return p


	class quadratic_classifier(base_classfier):
	u"2次識別境界識別器"

	def fit(s, data0, data1):
	s.setdata(data0, data1)

	s.cov0inv = np.linalg.inv(s.cov0)
	s.cov1inv = np.linalg.inv(s.cov1)
	s.S = s.cov0inv - s.cov1inv
	s.cT = np.dot(s.mu1, s.cov1inv) - np.dot(s.mu0, s.cov0inv)
	s.F = ( s.xT_m_x(s.mu0, s.cov0inv)
	- s.xT_m_x(s.mu1, s.cov1inv)
	+ np.log( np.linalg.det(s.cov0) / np.linalg.det(s.cov1) )
	- 2 * np.log(float(s.n0) / s.n1)
	)

	def predict_row(s, series):
	return (s.xT_m_x(series, s.S) + 2.0 * np.dot(s.cT, s.vv(series)) + s.F)[0, 0]

	def predict(s, data):
	p = data.apply(s.predict_row, axis = 1)
	p = np.where(p < 0.0, 0, 1)
	return p

	# ユーティリティ ---------------------
	def customized_plot(df, col_x, col_y, predict):
	plt.scatter(df[col_x], df[col_y], c=predict, cmap=plt.cm.rainbow_r)
	plt.xlabel(col_x)
	plt.ylabel(col_y)
	plt.plot()
	#plt.show()

	# ピマインディアンデータセットの分類 ---------------------
	plt.clf()
	fig = plt.figure(1)

	lc = linear_classfier()
	lc.fit(data0, data1)
	p1 = lc.predict(data)

	qc = quadratic_classifier()
	qc.fit(data0, data1)
	p2 = qc.predict(data)

	plt.subplot(2, 2, 1)
	plt.title('data')
	customized_plot(df, "glu", "bmi", df["type"])

	plt.subplot(2, 2, 3)
	plt.title('linear classfier')
	customized_plot(df, "glu", "bmi", p1)

	plt.subplot(2, 2, 4)
	plt.title('quadratic classifier')
	customized_plot(df, "glu", "bmi", p2)

	plt.show()
	fig.savefig('pima_classify.png')

	# 2次識別境界テスト用データセットの分類 ---------------------
	plt.clf()
	fig = plt.figure(1)

	lc2 = linear_classfier()
	lc2.fit(data2_0, data2_1)
	p2_1 = lc2.predict(data2)

	qc2 = quadratic_classifier()
	qc2.fit(data2_0, data2_1)
	p2_2 = qc2.predict(data2)

	plt.subplot(2, 2, 1)
	plt.title('data')
	customized_plot(df2, "x", "y", df2["t"])

	plt.subplot(2, 2, 3)
	plt.title('linear classfier')
	customized_plot(df2, "x", "y", p2_1)

	plt.subplot(2, 2, 4)
	plt.title('quadratic classifier')
	customized_plot(df2, "x", "y", p2_2)

	plt.show()
	fig.savefig('testdataset_classify.png')


	print "finished"