Created
April 4, 2015 04:21
-
-
Save threecourse/c06bb743662a6e4aae47 to your computer and use it in GitHub Desktop.
firstpattern recognition R example 4.4
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*- | |
import numpy as np | |
import pandas as pd | |
import matplotlib.pyplot as plt | |
from sklearn import datasets | |
import urllib | |
# ピマインディアンデータセット --------------------- | |
# http://archive.ics.uci.edu/ml/machine-learning-databases/pima-indians-diabetes/pima-indians-diabetes.names | |
# URL for the Pima Indians Diabetes dataset (UCI Machine Learning Repository) | |
url = "http://goo.gl/j0Rvxq" | |
# download the file | |
raw_data = urllib.urlopen(url) | |
# load the CSV file as a numpy matrix | |
dataset = np.loadtxt(raw_data, delimiter=",") | |
print "pima dataset shape", dataset.shape | |
df = pd.DataFrame(data = dataset[:,[1,5,8]], columns = ["glu", "bmi", "type"]) | |
df = df[ (df["glu"] > 0) & (df["bmi"] > 0) ] | |
data = df[["glu", "bmi"]] | |
data0 = data[df["type"] == 0] | |
data1 = data[df["type"] == 1] | |
# 2次識別境界テスト用データセット --------------------- | |
def determine_class(x, y): | |
factors = [3.0, 5.0, 3.0, -2.0, 0.5, -3.0] | |
params = np.array([x * x, x * y, y * y ,x ,y, 1.0]) | |
val = (np.array(factors) * np.array(params)).sum() | |
if val < 0.0 : return 0 | |
return 1 | |
np.random.seed(1234) | |
xs = np.random.normal(size=1000) | |
ys = np.random.normal(size=1000) | |
types = [determine_class(x,y) for (x,y) in zip(xs, ys)] | |
df2 = pd.DataFrame(data = np.array([xs, ys, types]).T, columns = ["x", "y", "t"]) | |
data2 = df2[["x", "y"]] | |
data2_0 = data2[df2["t"] == 0] | |
data2_1 = data2[df2["t"] == 1] | |
# 識別器 --------------------- | |
class base_classfier(object): | |
u"識別器ベースクラス" | |
def xT_m_x(s, x_ary, m): | |
x = s.vv(x_ary) # vertical vector | |
return np.dot(np.dot(x.T, m), x) | |
def setdata(s, data0, data1): | |
s.data0 = data0 | |
s.data1 = data1 | |
s.cov0 = np.cov(data0.T) | |
s.cov1 = np.cov(data1.T) | |
s.n0 = len(data0) | |
s.n1 = len(data1) | |
s.mu0 = data0.mean() | |
s.mu1 = data1.mean() | |
def vv(s, ary): | |
u"縦ベクトルへの変換" | |
return np.matrix(ary).T | |
class linear_classfier(base_classfier): | |
u"線形識別境界識別器" | |
def fit(s, data0, data1): | |
s.setdata(data0, data1) | |
s.covc = (s.cov0 * s.n0 + s.cov1 * s.n1) / (s.n0 + s.n1) | |
s.covcinv = np.linalg.inv(s.covc) | |
s.cT = np.dot(s.mu1 - s.mu0, s.covcinv) | |
s.F = ( s.xT_m_x(s.mu0, s.covcinv) | |
- s.xT_m_x(s.mu1, s.covcinv) | |
- 2 * np.log(float(s.n0) / s.n1)) | |
def predict_row(s, series): | |
return (2 * np.dot(s.cT, s.vv(series)) + s.F)[0, 0] | |
def predict(s, data): | |
p = data.apply(s.predict_row, axis = 1) | |
p = np.where(p < 0.0, 0, 1) | |
return p | |
class quadratic_classifier(base_classfier): | |
u"2次識別境界識別器" | |
def fit(s, data0, data1): | |
s.setdata(data0, data1) | |
s.cov0inv = np.linalg.inv(s.cov0) | |
s.cov1inv = np.linalg.inv(s.cov1) | |
s.S = s.cov0inv - s.cov1inv | |
s.cT = np.dot(s.mu1, s.cov1inv) - np.dot(s.mu0, s.cov0inv) | |
s.F = ( s.xT_m_x(s.mu0, s.cov0inv) | |
- s.xT_m_x(s.mu1, s.cov1inv) | |
+ np.log( np.linalg.det(s.cov0) / np.linalg.det(s.cov1) ) | |
- 2 * np.log(float(s.n0) / s.n1) | |
) | |
def predict_row(s, series): | |
return (s.xT_m_x(series, s.S) + 2.0 * np.dot(s.cT, s.vv(series)) + s.F)[0, 0] | |
def predict(s, data): | |
p = data.apply(s.predict_row, axis = 1) | |
p = np.where(p < 0.0, 0, 1) | |
return p | |
# ユーティリティ --------------------- | |
def customized_plot(df, col_x, col_y, predict): | |
plt.scatter(df[col_x], df[col_y], c=predict, cmap=plt.cm.rainbow_r) | |
plt.xlabel(col_x) | |
plt.ylabel(col_y) | |
plt.plot() | |
#plt.show() | |
# ピマインディアンデータセットの分類 --------------------- | |
plt.clf() | |
fig = plt.figure(1) | |
lc = linear_classfier() | |
lc.fit(data0, data1) | |
p1 = lc.predict(data) | |
qc = quadratic_classifier() | |
qc.fit(data0, data1) | |
p2 = qc.predict(data) | |
plt.subplot(2, 2, 1) | |
plt.title('data') | |
customized_plot(df, "glu", "bmi", df["type"]) | |
plt.subplot(2, 2, 3) | |
plt.title('linear classfier') | |
customized_plot(df, "glu", "bmi", p1) | |
plt.subplot(2, 2, 4) | |
plt.title('quadratic classifier') | |
customized_plot(df, "glu", "bmi", p2) | |
plt.show() | |
fig.savefig('pima_classify.png') | |
# 2次識別境界テスト用データセットの分類 --------------------- | |
plt.clf() | |
fig = plt.figure(1) | |
lc2 = linear_classfier() | |
lc2.fit(data2_0, data2_1) | |
p2_1 = lc2.predict(data2) | |
qc2 = quadratic_classifier() | |
qc2.fit(data2_0, data2_1) | |
p2_2 = qc2.predict(data2) | |
plt.subplot(2, 2, 1) | |
plt.title('data') | |
customized_plot(df2, "x", "y", df2["t"]) | |
plt.subplot(2, 2, 3) | |
plt.title('linear classfier') | |
customized_plot(df2, "x", "y", p2_1) | |
plt.subplot(2, 2, 4) | |
plt.title('quadratic classifier') | |
customized_plot(df2, "x", "y", p2_2) | |
plt.show() | |
fig.savefig('testdataset_classify.png') | |
print "finished" |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment