Skip to content

Instantly share code, notes, and snippets.

@guilhermebene
Last active May 7, 2021 13:38
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save guilhermebene/7ceeae528ef003b8a2c5901da6ee9c99 to your computer and use it in GitHub Desktop.
Save guilhermebene/7ceeae528ef003b8a2c5901da6ee9c99 to your computer and use it in GitHub Desktop.
Content-based Filtering
Display the source blob
Display the rendered blob
Raw
{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"name": "Content-based Filtering",
"provenance": [],
"collapsed_sections": [],
"authorship_tag": "ABX9TyM2PvcUxBHqBR0Kgg6logiI",
"include_colab_link": true
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
}
},
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "view-in-github",
"colab_type": "text"
},
"source": [
"<a href=\"https://colab.research.google.com/gist/guilhermebene/7ceeae528ef003b8a2c5901da6ee9c99/content-based-filtering.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
]
},
{
"cell_type": "code",
"metadata": {
"id": "klZyzzbOEW_a",
"colab_type": "code",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 469
},
"outputId": "64055770-e634-41d8-847c-56b7e9b73503"
},
"source": [
"# The PyPI distribution is named scikit-learn; the `sklearn` alias package is deprecated.\n",
"!pip install scikit-learn\n",
"!pip install matplotlib\n",
"!pip install ml_metrics"
],
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"text": [
"Requirement already satisfied: sklearn in /usr/local/lib/python3.6/dist-packages (0.0)\n",
"Requirement already satisfied: scikit-learn in /usr/local/lib/python3.6/dist-packages (from sklearn) (0.22.2.post1)\n",
"Requirement already satisfied: scipy>=0.17.0 in /usr/local/lib/python3.6/dist-packages (from scikit-learn->sklearn) (1.4.1)\n",
"Requirement already satisfied: numpy>=1.11.0 in /usr/local/lib/python3.6/dist-packages (from scikit-learn->sklearn) (1.18.5)\n",
"Requirement already satisfied: joblib>=0.11 in /usr/local/lib/python3.6/dist-packages (from scikit-learn->sklearn) (0.16.0)\n",
"Requirement already satisfied: matplotlib in /usr/local/lib/python3.6/dist-packages (3.2.2)\n",
"Requirement already satisfied: kiwisolver>=1.0.1 in /usr/local/lib/python3.6/dist-packages (from matplotlib) (1.2.0)\n",
"Requirement already satisfied: pyparsing!=2.0.4,!=2.1.2,!=2.1.6,>=2.0.1 in /usr/local/lib/python3.6/dist-packages (from matplotlib) (2.4.7)\n",
"Requirement already satisfied: numpy>=1.11 in /usr/local/lib/python3.6/dist-packages (from matplotlib) (1.18.5)\n",
"Requirement already satisfied: cycler>=0.10 in /usr/local/lib/python3.6/dist-packages (from matplotlib) (0.10.0)\n",
"Requirement already satisfied: python-dateutil>=2.1 in /usr/local/lib/python3.6/dist-packages (from matplotlib) (2.8.1)\n",
"Requirement already satisfied: six in /usr/local/lib/python3.6/dist-packages (from cycler>=0.10->matplotlib) (1.12.0)\n",
"Collecting ml_metrics\n",
" Downloading https://files.pythonhosted.org/packages/c1/e7/c31a2dd37045a0c904bee31c2dbed903d4f125a6ce980b91bae0c961abb8/ml_metrics-0.1.4.tar.gz\n",
"Requirement already satisfied: numpy in /usr/local/lib/python3.6/dist-packages (from ml_metrics) (1.18.5)\n",
"Requirement already satisfied: pandas in /usr/local/lib/python3.6/dist-packages (from ml_metrics) (1.0.5)\n",
"Requirement already satisfied: python-dateutil>=2.6.1 in /usr/local/lib/python3.6/dist-packages (from pandas->ml_metrics) (2.8.1)\n",
"Requirement already satisfied: pytz>=2017.2 in /usr/local/lib/python3.6/dist-packages (from pandas->ml_metrics) (2018.9)\n",
"Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.6/dist-packages (from python-dateutil>=2.6.1->pandas->ml_metrics) (1.12.0)\n",
"Building wheels for collected packages: ml-metrics\n",
" Building wheel for ml-metrics (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
" Created wheel for ml-metrics: filename=ml_metrics-0.1.4-cp36-none-any.whl size=7850 sha256=d6d404fb7cc21699de988380978695f01e91ecc323a7d536aaecd4c6d94805b5\n",
" Stored in directory: /root/.cache/pip/wheels/b3/61/2d/776be7b8a4f14c5db48c8e5451451cabc58dc6aa7ee3801163\n",
"Successfully built ml-metrics\n",
"Installing collected packages: ml-metrics\n",
"Successfully installed ml-metrics-0.1.4\n"
],
"name": "stdout"
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "NFp0Fo4B5oTC",
"colab_type": "code",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 124
},
"outputId": "6dd4f7a1-ee7f-4f6f-fd48-fcfa0a50607e"
},
"source": [
"from google.colab import drive\n",
"drive.mount('/gdrive/',force_remount=True)"
],
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"text": [
"Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly\n",
"\n",
"Enter your authorization code:\n",
"··········\n",
"Mounted at /gdrive/\n"
],
"name": "stdout"
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "LOtNnikUEhMd",
"colab_type": "code",
"colab": {}
},
"source": [
"import os\n",
"os.chdir(\"/gdrive/My Drive/BIOMED/Content-Based Filtering/Code\")\n",
"\n",
"from sklearn.model_selection import train_test_split\n",
"from sklearn.datasets import make_classification\n",
"from sklearn.metrics import confusion_matrix\n",
"import numpy as np\n",
"import pandas as pd\n",
"import matplotlib.pyplot as plt\n",
"from sklearn.naive_bayes import BernoulliNB, MultinomialNB\n",
"from sklearn.model_selection import train_test_split\n",
"from random import randint\n",
"import ml_metrics\n",
"%matplotlib inline"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {
"id": "ioRkUIyR0a2q",
"colab_type": "text"
},
"source": [
"# Dealing with the data"
]
},
{
"cell_type": "code",
"metadata": {
"id": "qBLwekj8-b4R",
"colab_type": "code",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 459
},
"outputId": "173bc4ca-d93d-4ec3-f8f5-d17ae6ac3d58"
},
"source": [
"%%time\n",
"movies = pd.read_csv('movies.csv')\n",
"\n",
"# Removing no-genre movies (the original row index is kept, not reset)\n",
"movies = movies[~movies['genres'].str.contains(\"no genres\")].copy()\n",
"\n",
"genres = ['Action','Adventure','Animation','Children','Comedy','Crime','Documentary','Drama','Fantasy','Film-Noir','Horror','Musical','Mystery','Romance','Sci-Fi','Thriller','War','Western','IMAX']\n",
"\n",
"# One 0/1 indicator column per genre, plus the number of movies carrying it.\n",
"# str.contains(..., regex=False) is the same literal substring test as\n",
"# `genre in row['genres']`, but runs once per genre over the whole column\n",
"# instead of once per row through iterrows().\n",
"frequency = []\n",
"for genre in genres:\n",
"    has_genre = movies['genres'].str.contains(genre, regex=False)\n",
"    movies[genre] = has_genre.astype(np.int32)\n",
"    frequency.append(has_genre.sum())\n",
"\n",
"# Removing the genres column\n",
"movies = movies.drop(columns='genres')\n",
"display(movies)"
],
"execution_count": null,
"outputs": [
{
"output_type": "display_data",
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>movieId</th>\n",
" <th>title</th>\n",
" <th>Action</th>\n",
" <th>Adventure</th>\n",
" <th>Animation</th>\n",
" <th>Children</th>\n",
" <th>Comedy</th>\n",
" <th>Crime</th>\n",
" <th>Documentary</th>\n",
" <th>Drama</th>\n",
" <th>Fantasy</th>\n",
" <th>Film-Noir</th>\n",
" <th>Horror</th>\n",
" <th>Musical</th>\n",
" <th>Mystery</th>\n",
" <th>Romance</th>\n",
" <th>Sci-Fi</th>\n",
" <th>Thriller</th>\n",
" <th>War</th>\n",
" <th>Western</th>\n",
" <th>IMAX</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>Toy Story (1995)</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>2</td>\n",
" <td>Jumanji (1995)</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>3</td>\n",
" <td>Grumpier Old Men (1995)</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>4</td>\n",
" <td>Waiting to Exhale (1995)</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>5</td>\n",
" <td>Father of the Bride Part II (1995)</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9737</th>\n",
" <td>193581</td>\n",
" <td>Black Butler: Book of the Atlantic (2017)</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9738</th>\n",
" <td>193583</td>\n",
" <td>No Game No Life: Zero (2017)</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9739</th>\n",
" <td>193585</td>\n",
" <td>Flint (2017)</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9740</th>\n",
" <td>193587</td>\n",
" <td>Bungo Stray Dogs: Dead Apple (2018)</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9741</th>\n",
" <td>193609</td>\n",
" <td>Andrew Dice Clay: Dice Rules (1991)</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>9708 rows × 21 columns</p>\n",
"</div>"
],
"text/plain": [
" movieId title ... Western IMAX\n",
"0 1 Toy Story (1995) ... 0 0\n",
"1 2 Jumanji (1995) ... 0 0\n",
"2 3 Grumpier Old Men (1995) ... 0 0\n",
"3 4 Waiting to Exhale (1995) ... 0 0\n",
"4 5 Father of the Bride Part II (1995) ... 0 0\n",
"... ... ... ... ... ...\n",
"9737 193581 Black Butler: Book of the Atlantic (2017) ... 0 0\n",
"9738 193583 No Game No Life: Zero (2017) ... 0 0\n",
"9739 193585 Flint (2017) ... 0 0\n",
"9740 193587 Bungo Stray Dogs: Dead Apple (2018) ... 0 0\n",
"9741 193609 Andrew Dice Clay: Dice Rules (1991) ... 0 0\n",
"\n",
"[9708 rows x 21 columns]"
]
},
"metadata": {
"tags": []
}
},
{
"output_type": "stream",
"text": [
"CPU times: user 16.7 s, sys: 11.5 ms, total: 16.7 s\n",
"Wall time: 16.8 s\n"
],
"name": "stdout"
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "hONvsBw52UQf",
"colab_type": "code",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 1000
},
"outputId": "66c465c6-ccf2-4e13-bbdc-168ff8a65935"
},
"source": [
"data_overview = pd.DataFrame(data=list(zip(genres,frequency)),columns=['Genres','No. of occurences']).sort_values('No. of occurences',ascending=False)\n",
"display(data_overview)\n",
"\n",
"fig = plt.figure(1, figsize=(18,13))\n",
"ax2 = fig.add_subplot(2,1,2)\n",
"plt.xticks(rotation=85, fontsize = 15)\n",
"plt.yticks(fontsize = 15)\n",
"plt.ylabel(\"No. of occurences\", fontsize = 24, labelpad = 0)\n",
"ax2.bar(data_overview['Genres'].tolist(), data_overview['No. of occurences'].tolist(), align = 'center', color='r')\n",
"plt.title(\"Popularity of Genres\",bbox={'facecolor':'k', 'pad':5},color='w',fontsize = 30)\n",
"plt.show()"
],
"execution_count": null,
"outputs": [
{
"output_type": "display_data",
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Genres</th>\n",
" <th>No. of occurences</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>Drama</td>\n",
" <td>4361</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>Comedy</td>\n",
" <td>3756</td>\n",
" </tr>\n",
" <tr>\n",
" <th>15</th>\n",
" <td>Thriller</td>\n",
" <td>1894</td>\n",
" </tr>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>Action</td>\n",
" <td>1828</td>\n",
" </tr>\n",
" <tr>\n",
" <th>13</th>\n",
" <td>Romance</td>\n",
" <td>1596</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>Adventure</td>\n",
" <td>1263</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>Crime</td>\n",
" <td>1199</td>\n",
" </tr>\n",
" <tr>\n",
" <th>14</th>\n",
" <td>Sci-Fi</td>\n",
" <td>980</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10</th>\n",
" <td>Horror</td>\n",
" <td>978</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>Fantasy</td>\n",
" <td>779</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>Children</td>\n",
" <td>664</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>Animation</td>\n",
" <td>611</td>\n",
" </tr>\n",
" <tr>\n",
" <th>12</th>\n",
" <td>Mystery</td>\n",
" <td>573</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>Documentary</td>\n",
" <td>440</td>\n",
" </tr>\n",
" <tr>\n",
" <th>16</th>\n",
" <td>War</td>\n",
" <td>382</td>\n",
" </tr>\n",
" <tr>\n",
" <th>11</th>\n",
" <td>Musical</td>\n",
" <td>334</td>\n",
" </tr>\n",
" <tr>\n",
" <th>17</th>\n",
" <td>Western</td>\n",
" <td>167</td>\n",
" </tr>\n",
" <tr>\n",
" <th>18</th>\n",
" <td>IMAX</td>\n",
" <td>158</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9</th>\n",
" <td>Film-Noir</td>\n",
" <td>87</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Genres No. of occurences\n",
"7 Drama 4361\n",
"4 Comedy 3756\n",
"15 Thriller 1894\n",
"0 Action 1828\n",
"13 Romance 1596\n",
"1 Adventure 1263\n",
"5 Crime 1199\n",
"14 Sci-Fi 980\n",
"10 Horror 978\n",
"8 Fantasy 779\n",
"3 Children 664\n",
"2 Animation 611\n",
"12 Mystery 573\n",
"6 Documentary 440\n",
"16 War 382\n",
"11 Musical 334\n",
"17 Western 167\n",
"18 IMAX 158\n",
"9 Film-Noir 87"
]
},
"metadata": {
"tags": []
}
},
{
"output_type": "display_data",
"data": {
"image/png": "\n",
"text/plain": [
"<Figure size 1296x936 with 1 Axes>"
]
},
"metadata": {
"tags": [],
"needs_background": "light"
}
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "8cSC1TDr81_l",
"colab_type": "code",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 424
},
"outputId": "1942e5ad-84b9-4149-e82c-cb89edfd2c97"
},
"source": [
"# Load the ratings and discard the timestamp column in one step.\n",
"ratings = pd.read_csv('ratings.csv').drop(columns='timestamp')\n",
"display(ratings)\n",
"\n",
"# Distinct user ids, in order of first appearance.\n",
"users = ratings['userId'].drop_duplicates().tolist()"
],
"execution_count": null,
"outputs": [
{
"output_type": "display_data",
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>userId</th>\n",
" <th>movieId</th>\n",
" <th>rating</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>4.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1</td>\n",
" <td>3</td>\n",
" <td>4.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>1</td>\n",
" <td>6</td>\n",
" <td>4.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>1</td>\n",
" <td>47</td>\n",
" <td>5.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>1</td>\n",
" <td>50</td>\n",
" <td>5.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>100831</th>\n",
" <td>610</td>\n",
" <td>166534</td>\n",
" <td>4.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>100832</th>\n",
" <td>610</td>\n",
" <td>168248</td>\n",
" <td>5.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>100833</th>\n",
" <td>610</td>\n",
" <td>168250</td>\n",
" <td>5.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>100834</th>\n",
" <td>610</td>\n",
" <td>168252</td>\n",
" <td>5.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>100835</th>\n",
" <td>610</td>\n",
" <td>170875</td>\n",
" <td>3.0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>100836 rows × 3 columns</p>\n",
"</div>"
],
"text/plain": [
" userId movieId rating\n",
"0 1 1 4.0\n",
"1 1 3 4.0\n",
"2 1 6 4.0\n",
"3 1 47 5.0\n",
"4 1 50 5.0\n",
"... ... ... ...\n",
"100831 610 166534 4.0\n",
"100832 610 168248 5.0\n",
"100833 610 168250 5.0\n",
"100834 610 168252 5.0\n",
"100835 610 170875 3.0\n",
"\n",
"[100836 rows x 3 columns]"
]
},
"metadata": {
"tags": []
}
}
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "QQEmDI1t0l2e",
"colab_type": "text"
},
"source": [
"# Testing the algorithm"
]
},
{
"cell_type": "code",
"metadata": {
"id": "PfGlA-6g55rt",
"colab_type": "code",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 528
},
"outputId": "909af108-af51-4d1a-8b1f-0c91339b9591"
},
"source": [
"# Pick a user id that actually occurs in the ratings. randint is inclusive on\n",
"# both ends, so randint(0,610) could return 0; the ratings shown above start\n",
"# at userId 1, and an id with no ratings leaves `data` empty.\n",
"user = users[randint(0, len(users) - 1)]\n",
"\n",
"# Genre indicators of every movie this user rated, joined with the rating.\n",
"data = movies.merge(ratings[ratings.userId == user], on='movieId', how='inner')\n",
"print(\"User: {}\".format(user))\n",
"print(\"Movies watched: \"+ str(data.shape[0]))\n",
"display(data)"
],
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"text": [
"User: 62\n",
"Movies watched: 363\n"
],
"name": "stdout"
},
{
"output_type": "display_data",
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>movieId</th>\n",
" <th>title</th>\n",
" <th>Action</th>\n",
" <th>Adventure</th>\n",
" <th>Animation</th>\n",
" <th>Children</th>\n",
" <th>Comedy</th>\n",
" <th>Crime</th>\n",
" <th>Documentary</th>\n",
" <th>Drama</th>\n",
" <th>Fantasy</th>\n",
" <th>Film-Noir</th>\n",
" <th>Horror</th>\n",
" <th>Musical</th>\n",
" <th>Mystery</th>\n",
" <th>Romance</th>\n",
" <th>Sci-Fi</th>\n",
" <th>Thriller</th>\n",
" <th>War</th>\n",
" <th>Western</th>\n",
" <th>IMAX</th>\n",
" <th>userId</th>\n",
" <th>rating</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>2</td>\n",
" <td>Jumanji (1995)</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>62</td>\n",
" <td>4.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>6</td>\n",
" <td>Heat (1995)</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>62</td>\n",
" <td>4.5</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>47</td>\n",
" <td>Seven (a.k.a. Se7en) (1995)</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>62</td>\n",
" <td>4.5</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>110</td>\n",
" <td>Braveheart (1995)</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>62</td>\n",
" <td>4.5</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>260</td>\n",
" <td>Star Wars: Episode IV - A New Hope (1977)</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>62</td>\n",
" <td>4.5</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>358</th>\n",
" <td>184471</td>\n",
" <td>Tomb Raider (2018)</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>62</td>\n",
" <td>3.5</td>\n",
" </tr>\n",
" <tr>\n",
" <th>359</th>\n",
" <td>185031</td>\n",
" <td>Alpha (2018)</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>62</td>\n",
" <td>4.5</td>\n",
" </tr>\n",
" <tr>\n",
" <th>360</th>\n",
" <td>185135</td>\n",
" <td>Sherlock - A Study in Pink (2010)</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>62</td>\n",
" <td>5.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>361</th>\n",
" <td>187593</td>\n",
" <td>Deadpool 2 (2018)</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>62</td>\n",
" <td>4.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>362</th>\n",
" <td>187595</td>\n",
" <td>Solo: A Star Wars Story (2018)</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>62</td>\n",
" <td>4.0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>363 rows × 23 columns</p>\n",
"</div>"
],
"text/plain": [
" movieId title ... userId rating\n",
"0 2 Jumanji (1995) ... 62 4.0\n",
"1 6 Heat (1995) ... 62 4.5\n",
"2 47 Seven (a.k.a. Se7en) (1995) ... 62 4.5\n",
"3 110 Braveheart (1995) ... 62 4.5\n",
"4 260 Star Wars: Episode IV - A New Hope (1977) ... 62 4.5\n",
".. ... ... ... ... ...\n",
"358 184471 Tomb Raider (2018) ... 62 3.5\n",
"359 185031 Alpha (2018) ... 62 4.5\n",
"360 185135 Sherlock - A Study in Pink (2010) ... 62 5.0\n",
"361 187593 Deadpool 2 (2018) ... 62 4.0\n",
"362 187595 Solo: A Star Wars Story (2018) ... 62 4.0\n",
"\n",
"[363 rows x 23 columns]"
]
},
"metadata": {
"tags": []
}
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "CsPvIYHscBtf",
"colab_type": "code",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 424
},
"outputId": "0af7f7ad-fda5-4655-ffdc-f6355732d1b4"
},
"source": [
"# Feature matrix: everything in `data` except the identifier and rating columns.\n",
"non_feature_columns = ['title', 'movieId', 'userId', 'rating']\n",
"X = data.drop(columns=non_feature_columns)\n",
"display(X)"
],
"execution_count": null,
"outputs": [
{
"output_type": "display_data",
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Action</th>\n",
" <th>Adventure</th>\n",
" <th>Animation</th>\n",
" <th>Children</th>\n",
" <th>Comedy</th>\n",
" <th>Crime</th>\n",
" <th>Documentary</th>\n",
" <th>Drama</th>\n",
" <th>Fantasy</th>\n",
" <th>Film-Noir</th>\n",
" <th>Horror</th>\n",
" <th>Musical</th>\n",
" <th>Mystery</th>\n",
" <th>Romance</th>\n",
" <th>Sci-Fi</th>\n",
" <th>Thriller</th>\n",
" <th>War</th>\n",
" <th>Western</th>\n",
" <th>IMAX</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>358</th>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>359</th>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>360</th>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>361</th>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>362</th>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>363 rows × 19 columns</p>\n",
"</div>"
],
"text/plain": [
" Action Adventure Animation Children ... Thriller War Western IMAX\n",
"0 0 1 0 1 ... 0 0 0 0\n",
"1 1 0 0 0 ... 1 0 0 0\n",
"2 0 0 0 0 ... 1 0 0 0\n",
"3 1 0 0 0 ... 0 1 0 0\n",
"4 1 1 0 0 ... 0 0 0 0\n",
".. ... ... ... ... ... ... ... ... ...\n",
"358 1 1 0 0 ... 0 0 0 0\n",
"359 0 1 0 0 ... 1 0 0 0\n",
"360 0 0 0 0 ... 0 0 0 0\n",
"361 1 0 0 0 ... 0 0 0 0\n",
"362 1 1 0 1 ... 0 0 0 0\n",
"\n",
"[363 rows x 19 columns]"
]
},
"metadata": {
"tags": []
}
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "4oZUqLyfcChM",
"colab_type": "code",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 312
},
"outputId": "6073575a-49f2-4aaa-cc51-f9589611e357"
},
"source": [
"Y = np.asarray(data['rating'],dtype=np.int32)\n",
"display(Y)"
],
"execution_count": null,
"outputs": [
{
"output_type": "display_data",
"data": {
"text/plain": [
"array([4, 4, 4, 4, 4, 5, 4, 4, 3, 5, 4, 4, 4, 5, 4, 5, 5, 4, 4, 4, 4, 5,\n",
" 4, 5, 4, 4, 4, 4, 3, 4, 5, 5, 4, 5, 4, 4, 5, 5, 4, 5, 4, 4, 5, 4,\n",
" 5, 3, 4, 4, 3, 5, 4, 3, 5, 4, 4, 3, 3, 4, 4, 4, 2, 4, 4, 5, 4, 5,\n",
" 3, 1, 3, 4, 4, 4, 4, 5, 4, 3, 4, 4, 4, 4, 3, 4, 5, 5, 4, 4, 5, 4,\n",
" 4, 3, 3, 5, 4, 3, 4, 3, 3, 3, 5, 5, 4, 5, 4, 4, 4, 3, 5, 4, 1, 4,\n",
" 2, 4, 4, 4, 4, 4, 4, 3, 5, 4, 4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 3, 5,\n",
" 3, 4, 4, 4, 4, 4, 4, 4, 5, 3, 3, 3, 4, 3, 4, 4, 5, 4, 3, 4, 4, 4,\n",
" 5, 4, 4, 4, 1, 5, 3, 4, 4, 4, 4, 1, 4, 5, 3, 4, 4, 4, 5, 4, 4, 4,\n",
" 4, 3, 3, 5, 4, 3, 4, 3, 4, 3, 3, 3, 4, 3, 4, 4, 4, 1, 4, 4, 4, 4,\n",
" 2, 3, 4, 4, 5, 4, 4, 4, 4, 3, 3, 4, 5, 3, 4, 5, 3, 3, 1, 5, 4, 4,\n",
" 4, 3, 4, 5, 2, 4, 4, 4, 4, 4, 4, 4, 3, 2, 4, 4, 4, 3, 4, 4, 4, 3,\n",
" 4, 5, 5, 4, 3, 3, 4, 4, 5, 4, 3, 4, 4, 4, 5, 4, 4, 4, 3, 4, 3, 4,\n",
" 4, 3, 4, 4, 3, 3, 3, 4, 5, 4, 4, 4, 4, 3, 4, 5, 5, 4, 4, 4, 3, 3,\n",
" 4, 4, 4, 4, 4, 4, 3, 4, 4, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4, 4, 3, 4,\n",
" 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 4, 5, 3, 3, 4, 3, 4, 2, 4, 4, 3,\n",
" 4, 4, 4, 3, 5, 4, 4, 3, 4, 4, 4, 4, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4,\n",
" 3, 4, 3, 3, 4, 4, 3, 4, 5, 4, 4], dtype=int32)"
]
},
"metadata": {
"tags": []
}
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "bw128tcaFGpJ",
"colab_type": "code",
"colab": {}
},
"source": [
"# Separating test movies from training movies\n",
"trainX,testX,trainY,testY = train_test_split(X.values,Y,test_size=0.3)"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "LkmrLFjBj-PX",
"colab_type": "code",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 52
},
"outputId": "e41d7833-1f82-47ed-d284-3479a918829f"
},
"source": [
"bnb = BernoulliNB(binarize=0.0)\n",
"bnb.fit(trainX, trainY)\n",
"score = bnb.score(testX,testY)\n",
"print(\"User: {}\\nFinal score: {}\".format(user,score))"
],
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"text": [
"User: 62\n",
"Final score: 0.5596330275229358\n"
],
"name": "stdout"
}
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "1Mo3LpT1As8h",
"colab_type": "text"
},
"source": [
"# Recommender Evaluation\n"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "8I0rXHnMa_UB",
"colab_type": "text"
},
"source": [
"## First method"
]
},
{
"cell_type": "code",
"metadata": {
"id": "N-lFtwdIA0Ne",
"colab_type": "code",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 1000
},
"outputId": "2e58e13c-082d-4173-c21f-90c882f0a3c2"
},
"source": [
"scores = []\n",
"final_precisions = []\n",
"final_accuracy = []\n",
"final_recalls = []\n",
"final_f1 = []\n",
"selected_users = []\n",
"set_sizes = []\n",
"train_sizes = []\n",
"test_sizes = []\n",
"final_tp = []\n",
"final_fp = []\n",
"final_fn = []\n",
"final_tn = []\n",
"predictions = []\n",
"test_sets = []\n",
"errors = []\n",
"unsuficient = 0\n",
"\n",
"user_amount = 200 # Amount of users for testing\n",
"i = 0\n",
"\n",
"while len(selected_users) < user_amount:\n",
"\n",
" user = randint(0,610)\n",
"\n",
" if user in selected_users:\n",
" continue\n",
"\n",
" # user = 599\n",
" data = movies.merge(ratings[ratings.userId == user], on='movieId',how='inner')\n",
"\n",
" X = data.drop(['title','movieId','userId','rating'],inplace=False,axis=1)\n",
" Y = np.asarray(data['rating'],dtype=np.int32)\n",
"\n",
" # User must have watched at least 33 movies so they will have a testing set with at least 10 movies\n",
" if len(Y) < 33:\n",
" unsuficient = unsuficient + 1\n",
" continue\n",
"\n",
" # Splitting into liked and disliked movies (if rating is greater or equal to 4)\n",
" for index,rating in enumerate(Y):\n",
" if rating >= 4:\n",
" Y[index] = 1\n",
" else:\n",
" Y[index] = 0\n",
"\n",
" # Separating test movies from training movies\n",
" trainX,testX,trainY,testY = train_test_split(X.values,Y,test_size=0.3,shuffle=True)\n",
" bnb = BernoulliNB(binarize=0.0)\n",
" bnb.fit(trainX, trainY)\n",
" score = bnb.score(testX,testY)\n",
" scores.append(score)\n",
" predicted = bnb.predict(testX)\n",
"\n",
" predictions.append(predicted)\n",
" test_sets.append(testY)\n",
" \n",
" tp = 0 # True positives\n",
" fp = 0 # False positives\n",
" fn = 0 # False negatives\n",
" tn = 0 # True negatives\n",
"\n",
" for index,recommendation in enumerate(predicted):\n",
" if recommendation == 1:\n",
" if recommendation == testY[index]:\n",
" tp = tp + 1\n",
" else:\n",
" fp = fp + 1\n",
" else:\n",
" if recommendation == testY[index]:\n",
" tn = tn + 1\n",
" else:\n",
" fn = fn + 1\n",
" # conf = confusion_matrix(testY,predicted)\n",
" # tn, fp, fn, tp = conf.ravel()\n",
" \n",
" final_tp.append(tp)\n",
" final_tn.append(tn)\n",
" final_fn.append(fn)\n",
" final_fp.append(fp)\n",
" \n",
" try: # An error happens if the algorithm fails to predict any positive value\n",
" precision = (tp/(tp+fp))\n",
" accuracy = ((tp+tn)/(tp+fp+fn+tn))\n",
" recall = (tp/(tp+fn))\n",
" f1 = (2*recall*precision)/(recall+precision)\n",
"\n",
" except:\n",
" errors.append(data)\n",
" continue\n",
"\n",
" final_precisions.append(precision)\n",
" final_accuracy.append(accuracy)\n",
" final_recalls.append(recall)\n",
" final_f1.append(f1)\n",
"\n",
" selected_users.append(user)\n",
" set_sizes.append(data.shape[0])\n",
" train_sizes.append(len(trainY))\n",
" test_sizes.append(len(testY))\n",
"\n",
" i = i + 1\n",
"\n",
"results = pd.DataFrame(list(zip(selected_users,set_sizes,train_sizes,test_sizes,final_tp,final_fp,final_fn,final_tn,scores,final_precisions,final_accuracy,final_recalls,final_f1)), \n",
" columns =['userId','Movies watched','Training set','Testing set','TP','FP','FN','TN','Score','Precision','Accuracy','Recall rate','F1-score'])\n",
"\n",
"print('Users with unsuficient data: {}'.format(unsuficient))\n",
"print('Errors: ' + str(len(errors)))\n",
"print('Selected Users: '+str(len(selected_users)))\n",
"\n",
"plt.plot(final_precisions,label='Precision')\n",
"plt.plot(final_accuracy,label='Accuracy')\n",
"plt.plot(final_recalls,label='Recall Rate')\n",
"plt.plot(final_f1,label='F1-score')\n",
"plt.xlabel('User')\n",
"plt.legend()\n",
"\n",
"results.head(user_amount)\n",
"\n",
"fig, axes = plt.subplots(nrows=2, ncols=2,figsize=(15,15))\n",
"axes[0,0].hist(final_precisions,bins=10)\n",
"axes[0,0].set_title('Precision')\n",
"axes[0,1].hist(final_accuracy,bins=10)\n",
"axes[0,1].set_title('Accuracy')\n",
"axes[1,0].hist(final_recalls,bins=10)\n",
"axes[1,0].set_title('Recall Rate')\n",
"axes[1,1].hist(final_f1,bins=10)\n",
"axes[1,1].set_title('F1-score')\n",
"\n",
"fig.subplots_adjust(hspace=0.4)"
],
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"text": [
"Users with unsuficient data: 68\n",
"Errors: 8\n",
"Selected Users: 200\n"
],
"name": "stdout"
},
{
"output_type": "display_data",
"data": {
"image/png": "\n",
"text/plain": [
"<Figure size 432x288 with 1 Axes>"
]
},
"metadata": {
"tags": [],
"needs_background": "light"
}
},
{
"output_type": "display_data",
"data": {
"image/png": "\n",
"text/plain": [
"<Figure size 1080x1080 with 4 Axes>"
]
},
"metadata": {
"tags": [],
"needs_background": "light"
}
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "qMQLuC7yA3pV",
"colab_type": "code",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 318
},
"outputId": "b68a8509-172e-4313-cf04-11bf7b1bcb65"
},
"source": [
"users_error = [element['userId'].iloc[0] for element in errors]\n",
"movies_watched = [element.shape[0] for element in errors]\n",
"print('Users whose the predicitions failed:')\n",
"display(pd.DataFrame(data=list(zip(users_error,movies_watched)),columns=['User','Movies Watched']))"
],
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"text": [
"Users whose the predicitions failed:\n"
],
"name": "stdout"
},
{
"output_type": "display_data",
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>User</th>\n",
" <th>Movies Watched</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>567</td>\n",
" <td>382</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>139</td>\n",
" <td>194</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>36</td>\n",
" <td>60</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>361</td>\n",
" <td>97</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>270</td>\n",
" <td>40</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>153</td>\n",
" <td>179</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>599</td>\n",
" <td>2474</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>510</td>\n",
" <td>108</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" User Movies Watched\n",
"0 567 382\n",
"1 139 194\n",
"2 36 60\n",
"3 361 97\n",
"4 270 40\n",
"5 153 179\n",
"6 599 2474\n",
"7 510 108"
]
},
"metadata": {
"tags": []
}
}
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "oZtKBCFOa26w",
"colab_type": "text"
},
"source": [
"## Coverage"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "OglZw3i5qTtX",
"colab_type": "text"
},
"source": [
        "### Run at the beginning"
]
},
{
"cell_type": "code",
"metadata": {
"id": "2XD0XD7ca7AJ",
"colab_type": "code",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 52
},
"outputId": "5d82c74a-9554-4037-85a2-1d4d479c374c"
},
"source": [
"%%time\n",
"\n",
"selected_users = []\n",
"recommendations = []\n",
"user_coverage = []\n",
"\n",
"user_amount = len(users) # Amount of users for testing\n",
"list_size = [10, 20, 30, 50, 100] # Recommendation list size\n",
"\n",
"# while len(selected_users) < user_amount:\n",
"for user in users:\n",
"\n",
" # user = randint(0,610)\n",
" data = movies.merge(ratings[ratings.userId == user],on='movieId',how='inner')\n",
"\n",
" # Separating unwatched movies\n",
" unwatched = movies.merge(ratings[ratings.userId == user], on='movieId', how='left', indicator=True).query('_merge == \"left_only\"').drop('_merge', 1)\n",
" unwatched.drop(['userId','rating'],inplace=True,axis=1)\n",
" \n",
" X = data.drop(['title','movieId','userId','rating'],inplace=False,axis=1)\n",
" Y = np.asarray(data['rating'],dtype=np.int32)\n",
"\n",
" # User must have watched at least 33 movies\n",
" # if len(Y) < 33:\n",
" # continue\n",
"\n",
" # Splitting into liked and disliked movies (if rating is greater or equal to 4)\n",
" for index,rating in enumerate(Y):\n",
" if rating >= 4:\n",
" Y[index] = 1\n",
" else:\n",
" Y[index] = 0\n",
"\n",
" bnb = BernoulliNB(binarize=0.0)\n",
" bnb.fit(X,Y)\n",
" prediction = bnb.predict(unwatched.drop(['title','movieId'],inplace=False,axis=1))\n",
"\n",
" recommendation = unwatched.copy()\n",
" recommendation['rating'] = prediction\n",
"\n",
" recommendation.drop(recommendation[recommendation.rating == 0].index, inplace=True)\n",
"\n",
" selected_users.append(user)\n",
" recommendations.append(recommendation)"
],
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"text": [
"CPU times: user 42.4 s, sys: 29.3 s, total: 1min 11s\n",
"Wall time: 36.4 s\n"
],
"name": "stdout"
}
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "6_iWByc7qdpq",
"colab_type": "text"
},
"source": [
"### User Coverage"
]
},
{
"cell_type": "code",
"metadata": {
"id": "Gl_g6kPRqjMW",
"colab_type": "code",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 241
},
"outputId": "65174050-e7fb-4767-86de-26ab1db5b914"
},
"source": [
"print('User coverage analysis')\n",
"print(\"Number of users: \" + str(user_amount))\n",
"\n",
"for size in list_size:\n",
" sum = 0\n",
" for recommendation in recommendations:\n",
" if recommendation.shape[0] < size:\n",
" sum = sum + recommendation.shape[0]/size\n",
" else:\n",
" sum = sum + 1\n",
" user_coverage.append(sum/user_amount)\n",
" # print('List size: {} User coverage: {}'.format(size,sum/user_amount))\n",
"\n",
"pd.DataFrame(list(zip(list_size,user_coverage)),columns=['Recommendation list size','User Coverage']).head()"
],
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"text": [
"User coverage analysis\n",
"Number of users: 610\n"
],
"name": "stdout"
},
{
"output_type": "execute_result",
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Recommendation list size</th>\n",
" <th>User Coverage</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>10</td>\n",
" <td>0.995574</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>20</td>\n",
" <td>0.993852</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>30</td>\n",
" <td>0.993169</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>50</td>\n",
" <td>0.992033</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>100</td>\n",
" <td>0.988967</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Recommendation list size User Coverage\n",
"0 10 0.995574\n",
"1 20 0.993852\n",
"2 30 0.993169\n",
"3 50 0.992033\n",
"4 100 0.988967"
]
},
"metadata": {
"tags": []
},
"execution_count": 366
}
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "nT8bf-Rrqzhr",
"colab_type": "text"
},
"source": [
"### Catalog Coverage"
]
},
{
"cell_type": "code",
"metadata": {
"id": "mcEoQmchqylu",
"colab_type": "code",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 224
},
"outputId": "1a7f1fb1-e868-48c1-8832-5e82286a4c99"
},
"source": [
"catalog_coverage = []\n",
"incomplete_lists = []\n",
"\n",
"for size in list_size:\n",
"\n",
" catalog = pd.DataFrame(index=recommendations[0].index,columns=recommendations[0].columns)\n",
" catalog.dropna(inplace=True)\n",
"\n",
" incomplete = 0\n",
" for recommendation in recommendations:\n",
" if recommendation.shape[0] >= size:\n",
" catalog = catalog.append(recommendation.sample(n = size,replace=False))\n",
" else: \n",
" catalog = catalog.append(recommendation)\n",
" incomplete = incomplete + 1\n",
"\n",
" incomplete_lists.append(incomplete)\n",
" catalog.drop_duplicates(keep='first',inplace=True)\n",
" catalog_coverage.append(catalog.shape[0]/movies.shape[0])\n",
"\n",
"print('Catalog coverage analysis')\n",
"pd.DataFrame(list(zip(list_size,catalog_coverage,incomplete_lists)),columns=['Recommendation list size','Catalog Coverage','Incomplete lists']).head()"
],
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"text": [
"Catalog coverage analysis\n"
],
"name": "stdout"
},
{
"output_type": "execute_result",
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Recommendation list size</th>\n",
" <th>Catalog Coverage</th>\n",
" <th>Incomplete lists</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>10</td>\n",
" <td>0.459827</td>\n",
" <td>3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>20</td>\n",
" <td>0.695612</td>\n",
" <td>5</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>30</td>\n",
" <td>0.828801</td>\n",
" <td>5</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>50</td>\n",
" <td>0.945097</td>\n",
" <td>7</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>100</td>\n",
" <td>0.996395</td>\n",
" <td>9</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Recommendation list size Catalog Coverage Incomplete lists\n",
"0 10 0.459827 3\n",
"1 20 0.695612 5\n",
"2 30 0.828801 5\n",
"3 50 0.945097 7\n",
"4 100 0.996395 9"
]
},
"metadata": {
"tags": []
},
"execution_count": 369
}
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "A2_ljdU-LORs",
"colab_type": "text"
},
"source": [
"## MAP"
]
},
{
"cell_type": "code",
"metadata": {
"id": "Za7As4DhLaif",
"colab_type": "code",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 519
},
"outputId": "7780927c-3cdf-4498-c0ab-a66177b3fee7"
},
"source": [
"%%time\n",
"\n",
"selected_users = []\n",
"actuals = []\n",
"predicts = []\n",
"list_size = [10,20,30,50,100] # Possible recommendation list sizes\n",
"mapk_results = []\n",
"user_amount = 611 # Amount of users for testing\n",
"\n",
"ignored = 0\n",
"\n",
"while len(selected_users) < user_amount:\n",
"\n",
" user = randint(0,610)\n",
"\n",
" data = movies.merge(ratings[ratings.userId == user], on='movieId',how='inner')\n",
"\n",
" unwatched = movies.merge(ratings[ratings.userId == user], on='movieId', how='left', indicator=True).query('_merge == \"left_only\"').drop('_merge', 1)\n",
" unwatched.drop(['userId','rating'],inplace=True,axis=1)\n",
"\n",
" Y = np.zeros((data.shape[0]), dtype=np.int32)\n",
" X = data.drop(['userId'],inplace=False,axis=1)\n",
"\n",
" # Splitting into liked (if rating is greater or equal to 4) and disliked movies\n",
" for index,rating in enumerate(X['rating'].tolist()):\n",
" if rating >= 4:\n",
" Y[index] = 1\n",
" else:\n",
" Y[index] = 0\n",
"\n",
" X['recommendation'] = Y\n",
"\n",
" if X.shape[0] < 10 or X.loc[X['recommendation'] == 1].shape[0] < 10:\n",
" # print('Error: small user repertoire')\n",
" ignored = ignored + 1\n",
" continue\n",
"\n",
" # Removing 10 movies for prediction\n",
" test = X.loc[X['recommendation'] == 1].sample(n=10,replace=False)\n",
" test.sort_values(by='rating',inplace=True,ascending=False)\n",
" test.drop('recommendation',inplace=True,axis=1)\n",
" train = X[~X['movieId'].isin(test['movieId'].tolist())].copy()\n",
"\n",
"# # Ignore user if the training set is too small\n",
" if train.shape[0] < 10:\n",
" ignored = ignored + 1\n",
" continue\n",
"\n",
" test.drop(['rating'],inplace=True,axis=1)\n",
" train.drop(['rating'],inplace=True,axis=1)\n",
"\n",
" trainY = np.asarray(train['recommendation'],dtype=np.int32)\n",
" trainX = train.drop('recommendation',inplace=False,axis=1)\n",
" \n",
" bnb = BernoulliNB(binarize=0.0)\n",
" bnb.fit(trainX.drop(['title','movieId'],inplace=False,axis=1), trainY)\n",
"\n",
" # Adding the 10 test movies to the unwatched movies for prediction\n",
" unwatched = unwatched.merge(test,on=unwatched.columns.to_list(),how='outer')\n",
" prediction = bnb.predict(unwatched.drop(['title','movieId'],inplace=False,axis=1))\n",
" proba = bnb.predict_proba(unwatched.drop(['title','movieId'],inplace=False,axis=1))\n",
"\n",
" probaN = []\n",
" probaP = []\n",
"\n",
" for line in proba:\n",
" probaN.append(line[0])\n",
" probaP.append(line[1])\n",
"\n",
" unwatched['Prediction'] = np.array(prediction)\n",
" unwatched['ProbaN'] = probaN\n",
" unwatched['ProbaP'] = probaP\n",
" unwatched.sort_values(by='ProbaP',inplace=True,ascending=False,na_position='first')\n",
"\n",
" actual = test['movieId'].values.tolist()\n",
" predicted = unwatched.loc[unwatched['Prediction']==1]\n",
" predicted = predicted['movieId'].values.tolist()\n",
"\n",
" actuals.append(actual)\n",
" predicts.append(predicted)\n",
"\n",
" selected_users.append(user)\n",
"\n",
"for k in list_size:\n",
" evaluation = ml_metrics.mapk(actuals,predicts,k=k)\n",
" mapk_results.append(evaluation)\n",
"\n",
"display(pd.DataFrame(data=list(zip(['MAP@10','MAP@20','MAP@30','MAP@50','MAP@100'],mapk_results)),columns=['Method','Score']))\n",
"\n",
"plt.plot(list_size,mapk_results)\n",
"plt.xlabel('Recommendation list size')\n",
"plt.ylabel('MAP')\n",
"plt.title('Mean Average Precision')"
],
"execution_count": null,
"outputs": [
{
"output_type": "display_data",
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Method</th>\n",
" <th>Score</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>MAP@10</td>\n",
" <td>0.001801</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>MAP@20</td>\n",
" <td>0.002131</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>MAP@30</td>\n",
" <td>0.002374</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>MAP@50</td>\n",
" <td>0.002586</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>MAP@100</td>\n",
" <td>0.002836</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Method Score\n",
"0 MAP@10 0.001801\n",
"1 MAP@20 0.002131\n",
"2 MAP@30 0.002374\n",
"3 MAP@50 0.002586\n",
"4 MAP@100 0.002836"
]
},
"metadata": {
"tags": []
}
},
{
"output_type": "stream",
"text": [
"CPU times: user 1min 41s, sys: 1min, total: 2min 42s\n",
"Wall time: 1min 28s\n"
],
"name": "stdout"
},
{
"output_type": "display_data",
"data": {
"image/png": "\n",
"text/plain": [
"<Figure size 432x288 with 1 Axes>"
]
},
"metadata": {
"tags": [],
"needs_background": "light"
}
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "wB5HWzaI8Btr",
"colab_type": "code",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 52
},
"outputId": "4487cda9-5a10-4458-b8db-ac87a0c17bfb"
},
"source": [
"!jupyter nbconvert --to html \"Content-based Filtering.ipynb\" "
],
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"text": [
"[NbConvertApp] Converting notebook Content-based Filtering.ipynb to html\n",
"[NbConvertApp] Writing 488011 bytes to Content-based Filtering.html\n"
],
"name": "stdout"
}
]
}
]
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment