{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"import numpy as np\n",
"import random"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# DMF"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"class DecomposedMatrixFactorization(object):\n",
"\n",
" def __init__(self, K=20, alpha_1=1e-6, alpha_2=1e-6, beta = 0.0):\n",
" self.K = K \n",
" self.alpha_1 = alpha_1\n",
" self.alpha_2 = alpha_2\n",
" self.beta = beta\n",
" \n",
" def fit(self, X, n_user, n_item, n_rate, n_iter = 100):\n",
" self.R = X.copy()\n",
" self.n_rate = n_rate\n",
" self.samples = []\n",
" self.user_factors = []\n",
" self.item_factors = []\n",
" self.w = []\n",
" \n",
" for i in range(self.n_rate):\n",
" tmp = X.copy()\n",
" tmp[:, 2] = (tmp[:, 2] >= (i+1)).astype(int)\n",
" self.samples.append(tmp)\n",
" self.user_factors.append(np.random.rand(n_user, self.K))\n",
" self.item_factors.append(np.random.rand(n_item, self.K))\n",
" self.w.append(np.ones(n_user))\n",
" \n",
" self.loss = []\n",
" for i in range(n_iter):\n",
" \n",
" for j in range(10):\n",
" for c in range(self.n_rate):\n",
" self.sgd(c)\n",
" \n",
" for k in range(10):\n",
" for c in range(self.n_rate):\n",
" self.sgd_user_scale(c)\n",
" \n",
" self.alpha_1 = self.alpha_1 / 2\n",
" self.alpha_2 = self.alpha_2 / 2\n",
" mse = self.mse()\n",
" self.loss.append((i, mse)) \n",
"\n",
" def sgd(self, c):\n",
" np.random.shuffle(self.samples[c])\n",
" for user, item, rating in self.samples[c]:\n",
" err = rating - self.predict_pair(user, item, c) \n",
" \n",
" # Update user and item\n",
" self.user_factors[c][user] += self.alpha_1 * (err * self.w[c][user] * self.item_factors[c][item] - 2 * self.beta * self.user_factors[c][user])\n",
" self.item_factors[c][item] += self.alpha_1 * (err * self.w[c][user] * self.user_factors[c][user] - 2 * self.beta * self.item_factors[c][item]) \n",
" self.user_factors[c][user] = self.user_factors[c][user] / np.sqrt(np.inner(self.user_factors[c][user], self.user_factors[c][user]))\n",
" self.item_factors[c][item] = self.item_factors[c][item] / np.sqrt(np.inner(self.item_factors[c][item], self.item_factors[c][item]))\n",
" \n",
" def sgd_user_scale(self, c):\n",
" np.random.shuffle(self.samples[c])\n",
" for user, item, rating in self.samples[c]:\n",
" err = rating - self.predict_pair(user, item, c)\n",
" #Update user scale\n",
" self.w[c][user] += self.alpha_2 * err * (np.inner(self.user_factors[c][user], self.item_factors[c][item]) + 1)\n",
" \n",
" def mse(self):\n",
" predicted = self.predict(self.R)\n",
" error = np.hstack((self.R, np.array(predicted).reshape(-1, 1)))\n",
" error = np.sqrt(pow((error[:, 2] - error[:, 3]), 2).mean())\n",
" \n",
" return error\n",
" \n",
" \n",
" def predict_pair(self, user, item, c):\n",
" return (np.inner(self.user_factors[c][user], self.item_factors[c][item]) + 1) / 2\n",
" \n",
" def predict(self, X):\n",
" rate = []\n",
" rate_tmp = np.zeros(self.n_rate)\n",
" for row in X:\n",
" for c in range(self.n_rate): \n",
" rate_tmp[c] = self.predict_pair(row[0], row[1], c)\n",
" rate.append(np.sum(rate_tmp))\n",
" return rate\n",
" \n",
" def get_full_matrix(self):\n",
" return np.inner(self.user_factors, self.item_factors)"
]
},
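{
"cell_type": "markdown",
"metadata": {},
"source": [
"The sketch below (added for illustration, not part of the original model code) shows the decomposition that `fit` performs: each raw rating is expanded into `n_rate` binary targets, one per threshold level, and each level gets its own pair of factor matrices."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Minimal sketch of the per-threshold binarization done inside fit():\n",
"# a 1-5 rating becomes n_rate indicator rows, one per level i with target (rating >= i + 1).\n",
"ratings = np.array([1, 3, 5])\n",
"n_levels = 5  # illustrative; fit() uses n_rate derived from the data\n",
"binarized = np.array([(ratings >= (i + 1)).astype(int) for i in range(n_levels)])\n",
"binarized"
]
},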
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# load data"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"import pandas as pd\n",
"\n",
"def load_ml100k():\n",
" samples = pd.read_csv('data/ml-100k/u.data', sep = '\\t', header=None)\n",
" \n",
" samples = samples.iloc[:, :3]\n",
" samples.columns = ['user', 'item', 'rate']\n",
" \n",
" samples['user'] = samples['user'] - 1\n",
" samples['item'] = samples['item'] - 1\n",
" \n",
" return samples"
]
},
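{
"cell_type": "markdown",
"metadata": {},
"source": [
"`u.data` from MovieLens 100k holds tab-separated `user id`, `item id`, `rating`, `timestamp` columns. The cell below is an optional stand-in (an addition for illustration, not part of the original pipeline): it builds a small synthetic frame with the same zero-indexed `user` / `item` / `rate` layout, so the rest of the notebook can be exercised without the dataset on disk."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Optional synthetic stand-in with the same layout as load_ml100k()'s output.\n",
"def load_toy():\n",
"    rng = np.random.RandomState(0)\n",
"    data = {\n",
"        'user': rng.randint(0, 10, size=200),\n",
"        'item': rng.randint(0, 20, size=200),\n",
"        'rate': rng.randint(1, 6, size=200)\n",
"    }\n",
"    return pd.DataFrame(data)[['user', 'item', 'rate']]\n",
"\n",
"load_toy().head()"
]
},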
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# main"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"df = np.array(load_ml100k())\n",
"\n",
"n_user = np.unique(df[:, 0]).max() + 1\n",
"n_item = np.unique(df[:, 1]).max() + 1\n",
"n_rate = np.unique(df[:, 2]).max()\n",
"\n",
"random.shuffle(df)\n",
"train_size = int(df.shape[0] * 0.8)\n",
"train_df = df[:train_size]\n",
"test_df = df[train_size:]"
]
},
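{
"cell_type": "markdown",
"metadata": {},
"source": [
"Optionally (an addition, not in the original cell above), seeding the shuffle makes the 80/20 split reproducible across runs; the cell below simply re-derives `train_df` / `test_df` with a fixed seed."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Optional: fix the seed so the same train/test split is produced every run.\n",
"np.random.seed(0)\n",
"np.random.shuffle(df)\n",
"train_size = int(df.shape[0] * 0.8)\n",
"train_df = df[:train_size]\n",
"test_df = df[train_size:]"
]
},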
{
"cell_type": "code",
"execution_count": 9,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"0.99228762189688302"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"DMF = DecomposedMatrixFactorization(K = 20, alpha_1 = 0.01, alpha_2 = 0.001, beta = 0.5)\n",
"DMF.fit(train_df, n_user, n_item, n_rate, n_iter = 2)\n",
"\n",
"pre3 = DMF.predict(test_df)\n",
"ret3 = np.hstack((test_df, np.array(pre3).reshape(-1, 1)))\n",
"np.sqrt(pow((ret3[:, 2] - ret3[:, 3]), 2).mean())"
]
},
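{
"cell_type": "markdown",
"metadata": {},
"source": [
"For context (an added reference point, not part of the original gist): the test RMSE of a constant predictor that always returns the mean training rating. The DMF RMSE above can be compared against this baseline."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Baseline: RMSE on the test set when every prediction is the mean training rating.\n",
"baseline = train_df[:, 2].mean()\n",
"np.sqrt(pow((test_df[:, 2] - baseline), 2).mean())"
]
}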
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.5.4"
}
},
"nbformat": 4,
"nbformat_minor": 0
}