Skip to content

Instantly share code, notes, and snippets.

@threecourse
Created April 4, 2015 04:21
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save threecourse/c06bb743662a6e4aae47 to your computer and use it in GitHub Desktop.
Save threecourse/c06bb743662a6e4aae47 to your computer and use it in GitHub Desktop.
firstpattern recognition R example 4.4
# -*- coding: utf-8 -*-
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import datasets
import urllib
# ピマインディアンデータセット ---------------------
# http://archive.ics.uci.edu/ml/machine-learning-databases/pima-indians-diabetes/pima-indians-diabetes.names
# URL for the Pima Indians Diabetes dataset (UCI Machine Learning Repository)
url = "http://goo.gl/j0Rvxq"
# download the file
raw_data = urllib.urlopen(url)
# load the CSV file as a numpy matrix
dataset = np.loadtxt(raw_data, delimiter=",")
print "pima dataset shape", dataset.shape
df = pd.DataFrame(data = dataset[:,[1,5,8]], columns = ["glu", "bmi", "type"])
df = df[ (df["glu"] > 0) & (df["bmi"] > 0) ]
data = df[["glu", "bmi"]]
data0 = data[df["type"] == 0]
data1 = data[df["type"] == 1]
# 2次識別境界テスト用データセット ---------------------
def determine_class(x, y):
factors = [3.0, 5.0, 3.0, -2.0, 0.5, -3.0]
params = np.array([x * x, x * y, y * y ,x ,y, 1.0])
val = (np.array(factors) * np.array(params)).sum()
if val < 0.0 : return 0
return 1
np.random.seed(1234)
xs = np.random.normal(size=1000)
ys = np.random.normal(size=1000)
types = [determine_class(x,y) for (x,y) in zip(xs, ys)]
df2 = pd.DataFrame(data = np.array([xs, ys, types]).T, columns = ["x", "y", "t"])
data2 = df2[["x", "y"]]
data2_0 = data2[df2["t"] == 0]
data2_1 = data2[df2["t"] == 1]
# 識別器 ---------------------
class base_classfier(object):
u"識別器ベースクラス"
def xT_m_x(s, x_ary, m):
x = s.vv(x_ary) # vertical vector
return np.dot(np.dot(x.T, m), x)
def setdata(s, data0, data1):
s.data0 = data0
s.data1 = data1
s.cov0 = np.cov(data0.T)
s.cov1 = np.cov(data1.T)
s.n0 = len(data0)
s.n1 = len(data1)
s.mu0 = data0.mean()
s.mu1 = data1.mean()
def vv(s, ary):
u"縦ベクトルへの変換"
return np.matrix(ary).T
class linear_classfier(base_classfier):
u"線形識別境界識別器"
def fit(s, data0, data1):
s.setdata(data0, data1)
s.covc = (s.cov0 * s.n0 + s.cov1 * s.n1) / (s.n0 + s.n1)
s.covcinv = np.linalg.inv(s.covc)
s.cT = np.dot(s.mu1 - s.mu0, s.covcinv)
s.F = ( s.xT_m_x(s.mu0, s.covcinv)
- s.xT_m_x(s.mu1, s.covcinv)
- 2 * np.log(float(s.n0) / s.n1))
def predict_row(s, series):
return (2 * np.dot(s.cT, s.vv(series)) + s.F)[0, 0]
def predict(s, data):
p = data.apply(s.predict_row, axis = 1)
p = np.where(p < 0.0, 0, 1)
return p
class quadratic_classifier(base_classfier):
u"2次識別境界識別器"
def fit(s, data0, data1):
s.setdata(data0, data1)
s.cov0inv = np.linalg.inv(s.cov0)
s.cov1inv = np.linalg.inv(s.cov1)
s.S = s.cov0inv - s.cov1inv
s.cT = np.dot(s.mu1, s.cov1inv) - np.dot(s.mu0, s.cov0inv)
s.F = ( s.xT_m_x(s.mu0, s.cov0inv)
- s.xT_m_x(s.mu1, s.cov1inv)
+ np.log( np.linalg.det(s.cov0) / np.linalg.det(s.cov1) )
- 2 * np.log(float(s.n0) / s.n1)
)
def predict_row(s, series):
return (s.xT_m_x(series, s.S) + 2.0 * np.dot(s.cT, s.vv(series)) + s.F)[0, 0]
def predict(s, data):
p = data.apply(s.predict_row, axis = 1)
p = np.where(p < 0.0, 0, 1)
return p
# ユーティリティ ---------------------
def customized_plot(df, col_x, col_y, predict):
plt.scatter(df[col_x], df[col_y], c=predict, cmap=plt.cm.rainbow_r)
plt.xlabel(col_x)
plt.ylabel(col_y)
plt.plot()
#plt.show()
# ピマインディアンデータセットの分類 ---------------------
plt.clf()
fig = plt.figure(1)
lc = linear_classfier()
lc.fit(data0, data1)
p1 = lc.predict(data)
qc = quadratic_classifier()
qc.fit(data0, data1)
p2 = qc.predict(data)
plt.subplot(2, 2, 1)
plt.title('data')
customized_plot(df, "glu", "bmi", df["type"])
plt.subplot(2, 2, 3)
plt.title('linear classfier')
customized_plot(df, "glu", "bmi", p1)
plt.subplot(2, 2, 4)
plt.title('quadratic classifier')
customized_plot(df, "glu", "bmi", p2)
plt.show()
fig.savefig('pima_classify.png')
# 2次識別境界テスト用データセットの分類 ---------------------
plt.clf()
fig = plt.figure(1)
lc2 = linear_classfier()
lc2.fit(data2_0, data2_1)
p2_1 = lc2.predict(data2)
qc2 = quadratic_classifier()
qc2.fit(data2_0, data2_1)
p2_2 = qc2.predict(data2)
plt.subplot(2, 2, 1)
plt.title('data')
customized_plot(df2, "x", "y", df2["t"])
plt.subplot(2, 2, 3)
plt.title('linear classfier')
customized_plot(df2, "x", "y", p2_1)
plt.subplot(2, 2, 4)
plt.title('quadratic classifier')
customized_plot(df2, "x", "y", p2_2)
plt.show()
fig.savefig('testdataset_classify.png')
print "finished"
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment