Created
May 13, 2018 01:13
-
-
Save koshian2/3012c9973a0a1bf664a62f12deff2349 to your computer and use it in GitHub Desktop.
Coursera Machine LearningをPythonで実装 - [Week9]異常検知、協調フィルタリング (2)協調フィルタリング
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import numpy as np | |
import matplotlib.pyplot as plt | |
from scipy.io import loadmat | |
## 1.映画のレーティングのデータの読み込み | |
def load_movies(): | |
data = loadmat("ex8_movies.mat") | |
return np.array(data['Y']), np.array(data['R']) | |
Y, R = load_movies() | |
# 1番目の映画の平均レート | |
print("Average rating for movie 1 (Toy Story):", np.mean(Y[0, R[0, :]]), "/ 5\n") | |
# データの可視化 | |
plt.imshow(Y) | |
plt.xlabel("Users") | |
plt.ylabel("Movies") | |
plt.show() | |
## 2.協調フィルタリングのコスト関数 | |
# 事前に訓練済みの値を読み込み | |
def load_movie_params(): | |
data = loadmat("ex8_movieParams.mat") | |
return np.array(data['X']), np.array(data['Theta']), data['num_users'], data['num_movies'], data['num_features'] | |
X, Theta, num_users, num_movies, num_features = load_movie_params() | |
# コスト関数を速く走らせるために一旦値を減らす(これでX,Y,Rがどんな構造かわかるはず) | |
num_users, num_movies, num_features = 4, 5, 3 | |
X = X[:num_movies, :num_features] | |
Theta = Theta[:num_users, :num_features] | |
Y = Y[:num_movies, :num_users] | |
R = R[:num_movies, :num_users] | |
# コスト関数 | |
def confi_costfunc(param, Y, R, lambda_): | |
# 最適化用にparam=[X, Theta]としておく | |
X, Theta = param | |
# 推定値との誤差(R[i,j]==1つまり、レートが与えられているもののみ計算するのが重要) | |
diffs = (np.dot(X, Theta.T) - Y) * R | |
# コスト関数 | |
J = np.sum(diffs ** 2) / 2 + lambda_ / 2 * (np.sum(Theta ** 2) + np.sum(X ** 2)) | |
# 勾配 | |
X_grad = np.dot(diffs, Theta) + lambda_ * X | |
Theta_grad = np.dot(diffs.T, X) + lambda_ * Theta | |
grad = [X_grad, Theta_grad] | |
return J, grad | |
# コスト関数のテスト | |
J, grad = confi_costfunc([X, Theta], Y, R, 0) | |
print("Cost at loaded parameters:", J) | |
print("(this value should be about 22.22)\n") | |
## 3.協調フィルタリングのコスト関数の勾配(Gradient Checking) | |
# Gradient Checking用の関数 | |
def compute_numerical_gradient(Jonly_func, param): | |
# コストのみ返す関数をJonly_funcにラップする、paramは[X, Theta]のように配置する | |
# paramが3変数以上になっても対応できているはず | |
shapes = [x.shape for x in param] | |
# 行列の要素数の累積和 | |
param_size_cumsum = np.append(0, np.cumsum([np.size(x) for x in param])) | |
# ベクトルとして格納 | |
numgrad, pertub = np.zeros(param_size_cumsum[-1]), np.zeros(param_size_cumsum[-1]) | |
e = 1e-4 | |
for p in range(param_size_cumsum[-1]): | |
pertub[p] = e | |
t1, t2 = [], [] | |
for i in range(len(param)): | |
delta_e = pertub[param_size_cumsum[i]:param_size_cumsum[i+1]].reshape(shapes[i]) | |
t1.append(param[i] - delta_e) | |
t2.append(param[i] + delta_e) | |
loss1, loss2 = Jonly_func(t1), Jonly_func(t2) | |
numgrad[p] = (loss2 - loss1) / (2*e) | |
pertub[p] = 0 | |
# 勾配を配列に戻す | |
grad = [] | |
for i in range(len(param)): | |
grad.append(numgrad[param_size_cumsum[i]:param_size_cumsum[i+1]].reshape(shapes[i])) | |
return grad | |
def check_cost_function(lambda_): | |
# 小さい問題を作る | |
X_t = np.random.rand(4, 3) | |
Theta_t = np.random.rand(5, 3) | |
Y = np.dot(X_t, Theta_t.T) | |
Y[np.random.rand(*Y.shape) > 0.5] = 0 | |
R = np.zeros(Y.shape) | |
R[Y != 0] = 1 | |
# Gradient Checkingを実行 | |
X = np.random.standard_normal(X_t.shape) | |
Theta = np.random.standard_normal(Theta_t.shape) | |
wrap = lambda P : confi_costfunc(P, Y, R, lambda_)[0] | |
numgrad = compute_numerical_gradient(wrap, [X, Theta]) | |
cost, grad = confi_costfunc([X, Theta], Y, R, lambda_) | |
# 比較 | |
diff_m, diff_p = 0, 0 | |
for n, g in zip(numgrad, grad): | |
for nn, gg in zip(np.ravel(n), np.ravel(g)): | |
diff_m += (nn - gg) ** 2 | |
diff_p += (nn + gg) ** 2 | |
print(nn, gg) | |
print("The above two columns you get should be very similar.") | |
print("(Left-Your Numerical Gradient, Right-Analytical Gradient)\n") | |
diff = np.sqrt(diff_m / diff_p) | |
print("If your cost function implementation is correct, then") | |
print("the relative difference will be small (less than 1e-9). ") | |
print("Relative Difference:", diff, "\n") | |
## 4.正則化ありの協調フィルタリングのコスト関数 | |
# コスト関数のテスト | |
J, grad = confi_costfunc([X, Theta], Y, R, 1.5) | |
print("Cost at loaded parameters (lambda = 1.5):", J) | |
print("(this value should be about 31.34)\n") | |
## 5. GraidientChecking | |
check_cost_function(1.5) | |
## 6.新しいユーザーのレーティングを入力 | |
# 映画のタイトルをロード | |
with open("movie_ids.txt", "rb") as fp: | |
# テキストのデコードがうまくいかないのでバイト単位で読み込む | |
movies_list = fp.readlines() | |
# 自分のレーティングを作る | |
my_ratings = np.zeros(1682) | |
# ID=0(トイ・ストーリー[1995]は4とする) | |
my_ratings[0] = 4 | |
# ID=97(羊たちの沈黙[1991]は2とする) | |
my_ratings[97] = 2 | |
# 同様にレーティングをつける | |
my_ratings[[6,11,53,63,65,68,182,225,354]]=3,5,4,5,3,5,4,5,5 | |
print("New user ratings:") | |
for i in range(len(my_ratings)): | |
if my_ratings[i] > 0: | |
print("Rated", my_ratings[i], "for", movies_list[i].decode(), end="") | |
## 7.映画のレーティングを学習 | |
# データのロード | |
Y, R = load_movies() | |
# 新しいユーザーのレーティングを追加 | |
Y = np.c_[my_ratings, Y] | |
R = np.c_[(my_ratings != 0).astype(int), R] | |
# レーティングを平均化(未評価の作品に対して平均値が推定されるようにするため) | |
def normalize_ratings(Y, R): | |
m, n = Y.shape | |
Ymean = np.zeros(m) | |
Ynorm = np.zeros(Y.shape) | |
for i in range(m): | |
idx = R[i, :] == 1 | |
Ymean[i] = np.mean(Y[i, idx]) | |
Ynorm[i, idx] = Y[i, idx] - Ymean[i] | |
return Ynorm, Ymean | |
Ynorm, Ymean = normalize_ratings(Y, R) | |
# 定数 | |
num_users = Y.shape[1] | |
num_movies = Y.shape[0] | |
num_features = 10 | |
# 初期値 | |
X = np.random.standard_normal((num_movies, num_features)) | |
Theta = np.random.standard_normal((num_users, num_features)) | |
lambda_ = 10 | |
# 最急降下法 | |
import sys, time, copy | |
def gradient_descent(initial_params, cost_wrap, eta, maxiter = 100): | |
theta_before = initial_params | |
for i in range(maxiter): | |
J, grad = cost_warp(theta_before) | |
theta = copy.deepcopy(theta_before) | |
for j in range(len(initial_params)): | |
theta[j] = theta[j] - eta * grad[j] | |
mes = f"iter = {i}, J = {J}" | |
theta_before = theta | |
# コンソールを埋め尽くさないように上書きしながら表示 | |
sys.stdout.write("\r%s" % mes) | |
sys.stdout.flush() | |
return theta, J | |
# 最小化するコスト関数 | |
cost_warp = lambda P:confi_costfunc(P, Ynorm, R, lambda_) | |
# 最急降下法の実行 | |
params, J = gradient_descent([X, Theta], cost_warp, 0.005, maxiter=1000) | |
# 結果の格納 | |
X, Theta = params | |
## 8.レコメンデーション | |
p = np.dot(X, Theta.T) | |
my_predictions = p[:, 0] + Ymean | |
# おすすめ順に降順ソート | |
idx = np.argsort(my_predictions)[::-1] | |
print("\n\nTop recommendations for you:") | |
for i in range(10): | |
print("Predicting rating", np.round(my_predictions[idx[i]], 2), "1f for movie", movies_list[idx[i]].decode(), end="") |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment