PomoML/lesson5-movielens-NN Only.ipynb

## lesson5-movielens-NN Only.ipynb
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {
    "heading_collapsed": true
   },
   "source": [
    "## Movielens"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "hidden": true
   },
   "outputs": [],
   "source": [
    "%reload_ext autoreload\n",
    "%autoreload 2\n",
    "%matplotlib inline\n",
    "\n",
    "from fastai.learner import *\n",
    "from fastai.column_data import *"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "hidden": true
   },
   "source": [
    "Data available from http://files.grouplens.org/datasets/movielens/ml-latest-small.zip"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "hidden": true
   },
   "outputs": [],
   "source": [
    "path='data/ml-latest-small/'"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "hidden": true
   },
   "source": [
    "We're working with the movielens data, which contains one rating per row, like this:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "hidden": true
   },
   "outputs": [],
   "source": [
    "ratings = pd.read_csv(path+'ratings.csv')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "heading_collapsed": true
   },
   "source": [
    "## Collaborative filtering"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "hidden": true
   },
   "outputs": [],
   "source": [
    "val_idxs = get_cv_idxs(len(ratings))\n",
    "n_factors = 50"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "hidden": true
   },
   "outputs": [],
   "source": [
    "#cf = CollabFilterDataset.from_csv(path, 'ratings.csv', 'userId', 'movieId', 'rating')\n",
    "#learn = cf.get_learner(n_factors, val_idxs, 64, opt_fn=optim.Adam)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "hidden": true
   },
   "outputs": [],
   "source": [
    "# Re-index raw user ids to contiguous integers 0..n_users-1, in order of first appearance.\n",
    "u_uniq = ratings.userId.unique()\n",
    "user2idx = dict(zip(u_uniq, range(len(u_uniq))))\n",
    "ratings.userId = ratings.userId.map(user2idx)\n",
    "\n",
    "# Same re-indexing for movie ids.\n",
    "m_uniq = ratings.movieId.unique()\n",
    "movie2idx = dict(zip(m_uniq, range(len(m_uniq))))\n",
    "ratings.movieId = ratings.movieId.map(movie2idx)\n",
    "\n",
    "# Counts of distinct users / movies after re-indexing.\n",
    "n_users = int(ratings.userId.nunique())\n",
    "n_movies = int(ratings.movieId.nunique())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "hidden": true
   },
   "outputs": [],
   "source": [
    "x = ratings.drop(['rating', 'timestamp'],axis=1)\n",
    "y = ratings['rating'].astype(np.float32)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "hidden": true
   },
   "outputs": [],
   "source": [
    "data = ColumnarModelData.from_data_frame(path, val_idxs, x, y, ['userId', 'movieId'], 64)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "hidden": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(0.5, 5.0)"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "min_rating,max_rating = ratings.rating.min().item(),ratings.rating.max().item()\n",
    "min_rating,max_rating"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "#next(iter(data.trn_dl))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Mini net"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {
    "code_folding": []
   },
   "outputs": [],
   "source": [
    "kValue = -1  # last value of the learnable margin k seen by forward(); -1 until the first forward pass\n",
    "class EmbeddingNet(nn.Module):\n",
    "    \"\"\"Embedding + one-hidden-layer net predicting a rating with a learnable output margin.\n",
    "\n",
    "    The user and movie embeddings (n_factors wide each, built with get_emb) are\n",
    "    concatenated, then passed through dropout -> linear -> ReLU -> dropout -> linear.\n",
    "    The result is squashed by a sigmoid onto (min_rating - k, max_rating + k),\n",
    "    where k (self.kv) is a trainable scalar initialised to 0.5.\n",
    "\n",
    "    Reads the notebook globals n_factors, min_rating and max_rating.\n",
    "\n",
    "    Args:\n",
    "        n_users: number of distinct user indices.\n",
    "        n_movies: number of distinct movie indices.\n",
    "        nh: width of the hidden layer.\n",
    "        p1: dropout probability applied to the concatenated embeddings.\n",
    "        p2: dropout probability applied after the hidden layer.\n",
    "    \"\"\"\n",
    "    def __init__(self, n_users, n_movies, nh=10, p1=0.05, p2=0.5):\n",
    "        super().__init__()\n",
    "        self.u = get_emb(n_users, n_factors)\n",
    "        self.m = get_emb(n_movies, n_factors)\n",
    "        self.lin1 = nn.Linear(n_factors*2, nh)\n",
    "        self.lin2 = nn.Linear(nh, 1)\n",
    "        self.drop1 = nn.Dropout(p1)\n",
    "        self.drop2 = nn.Dropout(p2)\n",
    "        # Assign the nn.Parameter itself so the module registers it: it then appears in\n",
    "        # model.parameters() (so the optimizer trains it) and in state_dict(), and it is\n",
    "        # moved to the GPU by model.cuda(). Storing the result of .cuda() instead keeps an\n",
    "        # unregistered copy, which left k stuck at its initial value.\n",
    "        # NOTE: state_dict() now has a 'kv' entry; snapshots saved before this change lack it.\n",
    "        self.kv = nn.Parameter(torch.FloatTensor(1).fill_(.5))  # initial k = 0.5\n",
    "\n",
    "    def sigscale1(self, y, k):\n",
    "        \"\"\"Scale sigmoid(y) onto the interval (min_rating - k, max_rating + k).\"\"\"\n",
    "        return F.sigmoid(y) * (max_rating-min_rating+2*k) + min_rating-k\n",
    "\n",
    "    def forward(self, cats, conts):\n",
    "        \"\"\"Predict ratings for a batch.\n",
    "\n",
    "        cats holds user indices in column 0 and movie indices in column 1;\n",
    "        conts is accepted but unused. Also records the current k in the\n",
    "        global kValue so it can be inspected after training.\n",
    "        \"\"\"\n",
    "        global kValue\n",
    "        users, movies = cats[:,0], cats[:,1]\n",
    "        x = self.drop1(torch.cat([self.u(users), self.m(movies)], dim=1))\n",
    "        x = self.drop2(F.relu(self.lin1(x)))\n",
    "        kValue = self.kv.data.cpu()[0]\n",
    "        # With k fixed at 0.5 this is sigmoid(.) * (max_rating-min_rating+1) + min_rating-0.5.\n",
    "        return self.sigscale1(self.lin2(x), self.kv)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "wd=1e-5\n",
    "model = EmbeddingNet(n_users, n_movies).cuda()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Save initial state once, and restore here to make consistent comparisons.\n",
    "if False:\n",
    "    torch.cuda.manual_seed_all(11111)\n",
    "    torch.manual_seed(11111)\n",
    "    model.load_state_dict(torch.load(\"MLModel.pt\"))\n",
    "else:\n",
    "    torch.save(model.state_dict(), \"MLModel.pt\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "2d3e3ae8e0fb4f3e931f74dbc830b970",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "HBox(children=(IntProgress(value=0, description='Epoch', max=12), HTML(value='')))"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "epoch      trn_loss   val_loss                                  \n",
      "    0      0.858251   0.812676  \n",
      "    1      0.843336   0.79294                                   \n",
      "    2      0.778295   0.784312                                  \n",
      "    3      0.76255    0.787983                                  \n",
      "    4      0.758702   0.791481                                  \n",
      "    5      0.752285   0.78832                                   \n",
      "    6      0.745778   0.791469                                  \n",
      "    7      0.729907   0.794116                                  \n",
      "    8      0.760971   0.790681                                  \n",
      "    9      0.747286   0.79349                                   \n",
      "    10     0.716794   0.792652                                  \n",
      "    11     0.736462   0.794784                                  \n",
      "\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "[array([0.79478])]"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "opt = optim.Adam(model.parameters(), 1e-3, weight_decay=wd)\n",
    "\n",
    "fit(model, data, 12, opt, F.mse_loss)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.5"
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "kValue"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [],
   "source": [
    "set_lrs(opt, 1e-3)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "e90674bd1f5e47c2965bb9064dd6d097",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "HBox(children=(IntProgress(value=0, description='Epoch', max=3), HTML(value='')))"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "epoch      trn_loss   val_loss                                  \n",
      "    0      0.717925   0.792505  \n",
      "    1      0.745821   0.793137                                  \n",
      "    2      0.712673   0.7943                                    \n",
      "\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "[array([0.7943])]"
      ]
     },
     "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "fit(model, data, 3, opt, F.mse_loss)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.5"
  },
  "toc": {
   "colors": {
    "hover_highlight": "#DAA520",
    "navigate_num": "#000000",
    "navigate_text": "#333333",
    "running_highlight": "#FF0000",
    "selected_highlight": "#FFD700",
    "sidebar_border": "#EEEEEE",
    "wrapper_background": "#FFFFFF"
   },
   "moveMenuLeft": true,
   "nav_menu": {
    "height": "123px",
    "width": "252px"
   },
   "navigate_menu": true,
   "number_sections": true,
   "sideBar": true,
   "threshold": 4,
   "toc_cell": false,
   "toc_section_display": "block",
   "toc_window_display": false,
   "widenNotebook": false
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
	{
	"cells": [
	{
	"cell_type": "markdown",
	"metadata": {
	"heading_collapsed": true
	},
	"source": [
	"## Movielens"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 1,
	"metadata": {
	"hidden": true
	},
	"outputs": [],
	"source": [
	"%reload_ext autoreload\n",
	"%autoreload 2\n",
	"%matplotlib inline\n",
	"\n",
	"from fastai.learner import *\n",
	"from fastai.column_data import *"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {
	"hidden": true
	},
	"source": [
	"Data available from http://files.grouplens.org/datasets/movielens/ml-latest-small.zip"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 2,
	"metadata": {
	"hidden": true
	},
	"outputs": [],
	"source": [
	"path='data/ml-latest-small/'"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {
	"hidden": true
	},
	"source": [
	"We're working with the movielens data, which contains one rating per row, like this:"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 3,
	"metadata": {
	"hidden": true
	},
	"outputs": [],
	"source": [
	"ratings = pd.read_csv(path+'ratings.csv')"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {
	"heading_collapsed": true
	},
	"source": [
	"## Collaborative filtering"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 4,
	"metadata": {
	"hidden": true
	},
	"outputs": [],
	"source": [
	"val_idxs = get_cv_idxs(len(ratings))\n",
	"n_factors = 50"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 5,
	"metadata": {
	"hidden": true
	},
	"outputs": [],
	"source": [
	"#cf = CollabFilterDataset.from_csv(path, 'ratings.csv', 'userId', 'movieId', 'rating')\n",
	"#learn = cf.get_learner(n_factors, val_idxs, 64, opt_fn=optim.Adam)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 6,
	"metadata": {
	"hidden": true
	},
	"outputs": [],
	"source": [
	"# Re-index raw user ids to contiguous integers 0..n_users-1, in order of first appearance.\n",
	"u_uniq = ratings.userId.unique()\n",
	"user2idx = dict(zip(u_uniq, range(len(u_uniq))))\n",
	"ratings.userId = ratings.userId.map(user2idx)\n",
	"\n",
	"# Same re-indexing for movie ids.\n",
	"m_uniq = ratings.movieId.unique()\n",
	"movie2idx = dict(zip(m_uniq, range(len(m_uniq))))\n",
	"ratings.movieId = ratings.movieId.map(movie2idx)\n",
	"\n",
	"# Counts of distinct users / movies after re-indexing.\n",
	"n_users = int(ratings.userId.nunique())\n",
	"n_movies = int(ratings.movieId.nunique())"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 7,
	"metadata": {
	"hidden": true
	},
	"outputs": [],
	"source": [
	"x = ratings.drop(['rating', 'timestamp'],axis=1)\n",
	"y = ratings['rating'].astype(np.float32)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 8,
	"metadata": {
	"hidden": true
	},
	"outputs": [],
	"source": [
	"data = ColumnarModelData.from_data_frame(path, val_idxs, x, y, ['userId', 'movieId'], 64)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 9,
	"metadata": {
	"hidden": true
	},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"(0.5, 5.0)"
	]
	},
	"execution_count": 9,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"min_rating,max_rating = ratings.rating.min().item(),ratings.rating.max().item()\n",
	"min_rating,max_rating"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 10,
	"metadata": {},
	"outputs": [],
	"source": [
	"#next(iter(data.trn_dl))"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"### Mini net"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 11,
	"metadata": {
	"code_folding": []
	},
	"outputs": [],
	"source": [
	"kValue = -1  # last value of the learnable margin k seen by forward(); -1 until the first forward pass\n",
	"class EmbeddingNet(nn.Module):\n",
	"    \"\"\"Embedding + one-hidden-layer net predicting a rating with a learnable output margin.\n",
	"\n",
	"    The user and movie embeddings (n_factors wide each, built with get_emb) are\n",
	"    concatenated, then passed through dropout -> linear -> ReLU -> dropout -> linear.\n",
	"    The result is squashed by a sigmoid onto (min_rating - k, max_rating + k),\n",
	"    where k (self.kv) is a trainable scalar initialised to 0.5.\n",
	"\n",
	"    Reads the notebook globals n_factors, min_rating and max_rating.\n",
	"\n",
	"    Args:\n",
	"        n_users: number of distinct user indices.\n",
	"        n_movies: number of distinct movie indices.\n",
	"        nh: width of the hidden layer.\n",
	"        p1: dropout probability applied to the concatenated embeddings.\n",
	"        p2: dropout probability applied after the hidden layer.\n",
	"    \"\"\"\n",
	"    def __init__(self, n_users, n_movies, nh=10, p1=0.05, p2=0.5):\n",
	"        super().__init__()\n",
	"        self.u = get_emb(n_users, n_factors)\n",
	"        self.m = get_emb(n_movies, n_factors)\n",
	"        self.lin1 = nn.Linear(n_factors*2, nh)\n",
	"        self.lin2 = nn.Linear(nh, 1)\n",
	"        self.drop1 = nn.Dropout(p1)\n",
	"        self.drop2 = nn.Dropout(p2)\n",
	"        # Assign the nn.Parameter itself so the module registers it: it then appears in\n",
	"        # model.parameters() (so the optimizer trains it) and in state_dict(), and it is\n",
	"        # moved to the GPU by model.cuda(). Storing the result of .cuda() instead keeps an\n",
	"        # unregistered copy, which left k stuck at its initial value.\n",
	"        # NOTE: state_dict() now has a 'kv' entry; snapshots saved before this change lack it.\n",
	"        self.kv = nn.Parameter(torch.FloatTensor(1).fill_(.5))  # initial k = 0.5\n",
	"\n",
	"    def sigscale1(self, y, k):\n",
	"        \"\"\"Scale sigmoid(y) onto the interval (min_rating - k, max_rating + k).\"\"\"\n",
	"        return F.sigmoid(y) * (max_rating-min_rating+2*k) + min_rating-k\n",
	"\n",
	"    def forward(self, cats, conts):\n",
	"        \"\"\"Predict ratings for a batch.\n",
	"\n",
	"        cats holds user indices in column 0 and movie indices in column 1;\n",
	"        conts is accepted but unused. Also records the current k in the\n",
	"        global kValue so it can be inspected after training.\n",
	"        \"\"\"\n",
	"        global kValue\n",
	"        users, movies = cats[:,0], cats[:,1]\n",
	"        x = self.drop1(torch.cat([self.u(users), self.m(movies)], dim=1))\n",
	"        x = self.drop2(F.relu(self.lin1(x)))\n",
	"        kValue = self.kv.data.cpu()[0]\n",
	"        # With k fixed at 0.5 this is sigmoid(.) * (max_rating-min_rating+1) + min_rating-0.5.\n",
	"        return self.sigscale1(self.lin2(x), self.kv)\n"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 12,
	"metadata": {},
	"outputs": [],
	"source": [
	"wd=1e-5\n",
	"model = EmbeddingNet(n_users, n_movies).cuda()"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 13,
	"metadata": {},
	"outputs": [],
	"source": [
	"#Save initial state once, and restore here to make consistent comparisons.\n",
	"if False:\n",
	" torch.cuda.manual_seed_all(11111)\n",
	" torch.manual_seed(11111)\n",
	" model.load_state_dict(torch.load(\"MLModel.pt\"))\n",
	"else:\n",
	" torch.save(model.state_dict(), \"MLModel.pt\")\n"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 14,
	"metadata": {},
	"outputs": [
	{
	"data": {
	"application/vnd.jupyter.widget-view+json": {
	"model_id": "2d3e3ae8e0fb4f3e931f74dbc830b970",
	"version_major": 2,
	"version_minor": 0
	},
	"text/plain": [
	"HBox(children=(IntProgress(value=0, description='Epoch', max=12), HTML(value='')))"
	]
	},
	"metadata": {},
	"output_type": "display_data"
	},
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"epoch trn_loss val_loss \n",
	" 0 0.858251 0.812676 \n",
	" 1 0.843336 0.79294 \n",
	" 2 0.778295 0.784312 \n",
	" 3 0.76255 0.787983 \n",
	" 4 0.758702 0.791481 \n",
	" 5 0.752285 0.78832 \n",
	" 6 0.745778 0.791469 \n",
	" 7 0.729907 0.794116 \n",
	" 8 0.760971 0.790681 \n",
	" 9 0.747286 0.79349 \n",
	" 10 0.716794 0.792652 \n",
	" 11 0.736462 0.794784 \n",
	"\n"
	]
	},
	{
	"data": {
	"text/plain": [
	"[array([0.79478])]"
	]
	},
	"execution_count": 14,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"opt = optim.Adam(model.parameters(), 1e-3, weight_decay=wd)\n",
	"\n",
	"fit(model, data, 12, opt, F.mse_loss)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 15,
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"0.5"
	]
	},
	"execution_count": 15,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"kValue"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 16,
	"metadata": {},
	"outputs": [],
	"source": [
	"set_lrs(opt, 1e-3)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 17,
	"metadata": {},
	"outputs": [
	{
	"data": {
	"application/vnd.jupyter.widget-view+json": {
	"model_id": "e90674bd1f5e47c2965bb9064dd6d097",
	"version_major": 2,
	"version_minor": 0
	},
	"text/plain": [
	"HBox(children=(IntProgress(value=0, description='Epoch', max=3), HTML(value='')))"
	]
	},
	"metadata": {},
	"output_type": "display_data"
	},
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"epoch trn_loss val_loss \n",
	" 0 0.717925 0.792505 \n",
	" 1 0.745821 0.793137 \n",
	" 2 0.712673 0.7943 \n",
	"\n"
	]
	},
	{
	"data": {
	"text/plain": [
	"[array([0.7943])]"
	]
	},
	"execution_count": 17,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"fit(model, data, 3, opt, F.mse_loss)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": []
	}
	],
	"metadata": {
	"kernelspec": {
	"display_name": "Python 3",
	"language": "python",
	"name": "python3"
	},
	"language_info": {
	"codemirror_mode": {
	"name": "ipython",
	"version": 3
	},
	"file_extension": ".py",
	"mimetype": "text/x-python",
	"name": "python",
	"nbconvert_exporter": "python",
	"pygments_lexer": "ipython3",
	"version": "3.6.5"
	},
	"toc": {
	"colors": {
	"hover_highlight": "#DAA520",
	"navigate_num": "#000000",
	"navigate_text": "#333333",
	"running_highlight": "#FF0000",
	"selected_highlight": "#FFD700",
	"sidebar_border": "#EEEEEE",
	"wrapper_background": "#FFFFFF"
	},
	"moveMenuLeft": true,
	"nav_menu": {
	"height": "123px",
	"width": "252px"
	},
	"navigate_menu": true,
	"number_sections": true,
	"sideBar": true,
	"threshold": 4,
	"toc_cell": false,
	"toc_section_display": "block",
	"toc_window_display": false,
	"widenNotebook": false
	}
	},
	"nbformat": 4,
	"nbformat_minor": 2
	}