Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save tatamiya/28431be826bcb6d0e94756ff8bac427e to your computer and use it in GitHub Desktop.
Save tatamiya/28431be826bcb6d0e94756ff8bac427e to your computer and use it in GitHub Desktop.
Display Feature Importances in a Recommendation Algorithm (LightFM)
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"[What's This Notebook?]\n",
"\n",
"- Display feature importances in a recommendation algorithm.\n",
" - based on linear coefficients\n",
" - permutation importances\n",
"\n",
"[Settings]\n",
"- Algorithm: LightFM\n",
" - https://github.com/lyst/lightfm\n",
" - https://tatamiya-practice.hatenablog.com/entry/2020/03/21/203332\n",
"- Data: MovieLens\n",
" - https://grouplens.org/datasets/movielens/"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/Users/tatamiya/.pyenv/versions/3.7.4/envs/recommend/lib/python3.7/site-packages/lightfm/_lightfm_fast.py:9: UserWarning: LightFM was compiled without OpenMP support. Only a single thread will be used.\n",
" warnings.warn('LightFM was compiled without OpenMP support. '\n"
]
}
],
"source": [
"from lightfm import LightFM, evaluation\n",
"import pandas as pd\n",
"import numpy as np\n",
"import scipy.sparse as sp"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'1.0.1'"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"pd.__version__"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>movieId</th>\n",
" <th>title</th>\n",
" <th>genres</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>Toy Story (1995)</td>\n",
" <td>Adventure|Animation|Children|Comedy|Fantasy</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>2</td>\n",
" <td>Jumanji (1995)</td>\n",
" <td>Adventure|Children|Fantasy</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>3</td>\n",
" <td>Grumpier Old Men (1995)</td>\n",
" <td>Comedy|Romance</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>4</td>\n",
" <td>Waiting to Exhale (1995)</td>\n",
" <td>Comedy|Drama|Romance</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>5</td>\n",
" <td>Father of the Bride Part II (1995)</td>\n",
" <td>Comedy</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" movieId title \\\n",
"0 1 Toy Story (1995) \n",
"1 2 Jumanji (1995) \n",
"2 3 Grumpier Old Men (1995) \n",
"3 4 Waiting to Exhale (1995) \n",
"4 5 Father of the Bride Part II (1995) \n",
"\n",
" genres \n",
"0 Adventure|Animation|Children|Comedy|Fantasy \n",
"1 Adventure|Children|Fantasy \n",
"2 Comedy|Romance \n",
"3 Comedy|Drama|Romance \n",
"4 Comedy "
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df_movies = pd.read_csv('data/ml-latest-small/movies.csv')\n",
"df_movies.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Dataset\n",
"- create user-item interaction matrix"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>userId</th>\n",
" <th>movieId</th>\n",
" <th>rating</th>\n",
" <th>timestamp</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>4.0</td>\n",
" <td>964982703</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1</td>\n",
" <td>3</td>\n",
" <td>4.0</td>\n",
" <td>964981247</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>1</td>\n",
" <td>6</td>\n",
" <td>4.0</td>\n",
" <td>964982224</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>1</td>\n",
" <td>47</td>\n",
" <td>5.0</td>\n",
" <td>964983815</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>1</td>\n",
" <td>50</td>\n",
" <td>5.0</td>\n",
" <td>964982931</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" userId movieId rating timestamp\n",
"0 1 1 4.0 964982703\n",
"1 1 3 4.0 964981247\n",
"2 1 6 4.0 964982224\n",
"3 1 47 5.0 964983815\n",
"4 1 50 5.0 964982931"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df_ratings = pd.read_csv('data/ml-latest-small/ratings.csv')\n",
"df_ratings.head()"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"class DataSet():\n",
"    \"\"\"Map user/item labels to integer indices and build the interaction matrix.\n",
"\n",
"    Parameters\n",
"    ----------\n",
"    df : pandas.DataFrame\n",
"        Interaction log; one row per (user, item) interaction.\n",
"    col_user, col_item, col_interaction : str\n",
"        Names of the user-id, item-id and interaction-value columns of `df`.\n",
"    item_master, user_master : pandas.DataFrame, optional\n",
"        When given, the `col_item` / `col_user` column of the master defines\n",
"        the index space; otherwise the labels found in `df` are used.\n",
"\n",
"    Attributes\n",
"    ----------\n",
"    user2index, item2index : dict\n",
"        label -> 0-based index, in order of first appearance.\n",
"    index2user, index2item : dict\n",
"        Inverse mappings (index -> label).\n",
"    n_users, n_items : int\n",
"        Number of distinct users / items in the index space.\n",
"    ui_matrix : scipy.sparse.csr_matrix\n",
"        Interaction values, shape (n_users, n_items).\n",
"\n",
"    Raises\n",
"    ------\n",
"    ValueError\n",
"        If `df` contains a user or item label that has no index.\n",
"    \"\"\"\n",
"\n",
"    def __init__(self, df, col_user, col_item, col_interaction, item_master=None, user_master=None):\n",
"        self.col_user, self.col_item, self.col_interaction = col_user, col_item, col_interaction\n",
"\n",
"        # A master table, when supplied, decides which labels exist.\n",
"        user_labels = df[col_user] if user_master is None else user_master[col_user]\n",
"        item_labels = df[col_item] if item_master is None else item_master[col_item]\n",
"        self.user2index, self.index2user = self._label_indexer(user_labels)\n",
"        self.item2index, self.index2item = self._label_indexer(item_labels)\n",
"\n",
"        self.n_users, self.n_items = len(self.index2user), len(self.index2item)\n",
"\n",
"        data = df[col_interaction].values\n",
"        row_ind = self._to_indices(df[col_user], self.user2index)\n",
"        col_ind = self._to_indices(df[col_item], self.item2index)\n",
"        # Pass the shape explicitly: without it scipy infers the shape from the\n",
"        # largest index present, so users/items that exist only in a master\n",
"        # table could be silently dropped from the matrix.\n",
"        self.ui_matrix = sp.csr_matrix(\n",
"            (data, (row_ind, col_ind)), shape=(self.n_users, self.n_items)\n",
"        )\n",
"\n",
"    def _label_indexer(self, X):\n",
"        \"\"\"Return (label -> index, index -> label) dicts for the unique values of X.\"\"\"\n",
"        label2index = {label: i for i, label in enumerate(X.unique())}\n",
"        index2label = {i: label for label, i in label2index.items()}\n",
"\n",
"        return label2index, index2label\n",
"\n",
"    @staticmethod\n",
"    def _to_indices(labels, label2index):\n",
"        \"\"\"Map a Series of labels to integer indices, rejecting unknown labels.\"\"\"\n",
"        indices = labels.map(label2index)\n",
"        unknown = indices.isna()\n",
"        if unknown.any():\n",
"            raise ValueError(\n",
"                'labels missing from the index: {}'.format(labels[unknown].unique()[:5].tolist())\n",
"            )\n",
"        return indices.astype('int64').values\n"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"dataset = DataSet(df_ratings, 'userId', 'movieId', 'rating', item_master=df_movies)"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"csr_ui_matrix = dataset.ui_matrix"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"9742"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"dataset.n_items"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"610"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"dataset.n_users"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Item Features\n",
"\n",
"Create item features from the movie master:\n",
"\n",
"- year and recency\n",
"- genre"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>movieId</th>\n",
" <th>title</th>\n",
" <th>genres</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>Toy Story (1995)</td>\n",
" <td>Adventure|Animation|Children|Comedy|Fantasy</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>2</td>\n",
" <td>Jumanji (1995)</td>\n",
" <td>Adventure|Children|Fantasy</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>3</td>\n",
" <td>Grumpier Old Men (1995)</td>\n",
" <td>Comedy|Romance</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>4</td>\n",
" <td>Waiting to Exhale (1995)</td>\n",
" <td>Comedy|Drama|Romance</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>5</td>\n",
" <td>Father of the Bride Part II (1995)</td>\n",
" <td>Comedy</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" movieId title \\\n",
"0 1 Toy Story (1995) \n",
"1 2 Jumanji (1995) \n",
"2 3 Grumpier Old Men (1995) \n",
"3 4 Waiting to Exhale (1995) \n",
"4 5 Father of the Bride Part II (1995) \n",
"\n",
" genres \n",
"0 Adventure|Animation|Children|Comedy|Fantasy \n",
"1 Adventure|Children|Fantasy \n",
"2 Comedy|Romance \n",
"3 Comedy|Drama|Romance \n",
"4 Comedy "
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df_movies.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Genres"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [],
"source": [
"df_movies_tmp = df_movies.copy()\n",
"df_movies_tmp['genres_split'] = df_movies['genres'].str.split('|')\n",
"df_genres_explode = df_movies_tmp[['movieId', 'genres_split']].explode('genres_split')\n",
"\n",
"df_genres_explode['values'] = 1\n",
"df_genres_pivot = df_genres_explode.pivot_table(index='movieId', columns='genres_split', values='values', aggfunc='count', fill_value=0)"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th>genres_split</th>\n",
" <th>(no genres listed)</th>\n",
" <th>Action</th>\n",
" <th>Adventure</th>\n",
" <th>Animation</th>\n",
" <th>Children</th>\n",
" <th>Comedy</th>\n",
" <th>Crime</th>\n",
" <th>Documentary</th>\n",
" <th>Drama</th>\n",
" <th>Fantasy</th>\n",
" <th>Film-Noir</th>\n",
" <th>Horror</th>\n",
" <th>IMAX</th>\n",
" <th>Musical</th>\n",
" <th>Mystery</th>\n",
" <th>Romance</th>\n",
" <th>Sci-Fi</th>\n",
" <th>Thriller</th>\n",
" <th>War</th>\n",
" <th>Western</th>\n",
" </tr>\n",
" <tr>\n",
" <th>movieId</th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
"genres_split (no genres listed) Action Adventure Animation Children \\\n",
"movieId \n",
"1 0 0 1 1 1 \n",
"2 0 0 1 0 1 \n",
"3 0 0 0 0 0 \n",
"4 0 0 0 0 0 \n",
"5 0 0 0 0 0 \n",
"\n",
"genres_split Comedy Crime Documentary Drama Fantasy Film-Noir Horror \\\n",
"movieId \n",
"1 1 0 0 0 1 0 0 \n",
"2 0 0 0 0 1 0 0 \n",
"3 1 0 0 0 0 0 0 \n",
"4 1 0 0 1 0 0 0 \n",
"5 1 0 0 0 0 0 0 \n",
"\n",
"genres_split IMAX Musical Mystery Romance Sci-Fi Thriller War Western \n",
"movieId \n",
"1 0 0 0 0 0 0 0 0 \n",
"2 0 0 0 0 0 0 0 0 \n",
"3 0 0 0 1 0 0 0 0 \n",
"4 0 0 0 1 0 0 0 0 \n",
"5 0 0 0 0 0 0 0 0 "
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df_genres_pivot.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Year and Recency\n",
"- Extract release year from the movie title.\n",
" - The year appears in parentheses.\n",
" - If a title has no release year, tentatively set it to 1900.\n",
"- Scale it into [0, 1] range and call it 'recency'"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [],
"source": [
"# Raw string so that \\( and \\) reach the regex engine as escaped parentheses\n",
"# instead of being parsed as (invalid) Python string escapes.\n",
"# The leading greedy .* makes the capture group take the last parenthesised part of the title.\n",
"df_movies_tmp['year'] = df_movies['title'].str.extract(r'.*\\((.*)\\).*', expand=False)"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>movieId</th>\n",
" <th>title</th>\n",
" <th>genres</th>\n",
" <th>genres_split</th>\n",
" <th>year</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>6059</th>\n",
" <td>40697</td>\n",
" <td>Babylon 5</td>\n",
" <td>Sci-Fi</td>\n",
" <td>[Sci-Fi]</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9031</th>\n",
" <td>140956</td>\n",
" <td>Ready Player One</td>\n",
" <td>Action|Sci-Fi|Thriller</td>\n",
" <td>[Action, Sci-Fi, Thriller]</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9091</th>\n",
" <td>143410</td>\n",
" <td>Hyena Road</td>\n",
" <td>(no genres listed)</td>\n",
" <td>[(no genres listed)]</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9138</th>\n",
" <td>147250</td>\n",
" <td>The Adventures of Sherlock Holmes and Doctor W...</td>\n",
" <td>(no genres listed)</td>\n",
" <td>[(no genres listed)]</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9179</th>\n",
" <td>149334</td>\n",
" <td>Nocturnal Animals</td>\n",
" <td>Drama|Thriller</td>\n",
" <td>[Drama, Thriller]</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9259</th>\n",
" <td>156605</td>\n",
" <td>Paterson</td>\n",
" <td>(no genres listed)</td>\n",
" <td>[(no genres listed)]</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9367</th>\n",
" <td>162414</td>\n",
" <td>Moonlight</td>\n",
" <td>Drama</td>\n",
" <td>[Drama]</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9448</th>\n",
" <td>167570</td>\n",
" <td>The OA</td>\n",
" <td>(no genres listed)</td>\n",
" <td>[(no genres listed)]</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9514</th>\n",
" <td>171495</td>\n",
" <td>Cosmos</td>\n",
" <td>(no genres listed)</td>\n",
" <td>[(no genres listed)]</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9515</th>\n",
" <td>171631</td>\n",
" <td>Maria Bamford: Old Baby</td>\n",
" <td>(no genres listed)</td>\n",
" <td>[(no genres listed)]</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9518</th>\n",
" <td>171749</td>\n",
" <td>Death Note: Desu nôto (2006–2007)</td>\n",
" <td>(no genres listed)</td>\n",
" <td>[(no genres listed)]</td>\n",
" <td>2006–2007</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9525</th>\n",
" <td>171891</td>\n",
" <td>Generation Iron 2</td>\n",
" <td>(no genres listed)</td>\n",
" <td>[(no genres listed)]</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9611</th>\n",
" <td>176601</td>\n",
" <td>Black Mirror</td>\n",
" <td>(no genres listed)</td>\n",
" <td>[(no genres listed)]</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" movieId title \\\n",
"6059 40697 Babylon 5 \n",
"9031 140956 Ready Player One \n",
"9091 143410 Hyena Road \n",
"9138 147250 The Adventures of Sherlock Holmes and Doctor W... \n",
"9179 149334 Nocturnal Animals \n",
"9259 156605 Paterson \n",
"9367 162414 Moonlight \n",
"9448 167570 The OA \n",
"9514 171495 Cosmos \n",
"9515 171631 Maria Bamford: Old Baby \n",
"9518 171749 Death Note: Desu nôto (2006–2007) \n",
"9525 171891 Generation Iron 2 \n",
"9611 176601 Black Mirror \n",
"\n",
" genres genres_split year \n",
"6059 Sci-Fi [Sci-Fi] NaN \n",
"9031 Action|Sci-Fi|Thriller [Action, Sci-Fi, Thriller] NaN \n",
"9091 (no genres listed) [(no genres listed)] NaN \n",
"9138 (no genres listed) [(no genres listed)] NaN \n",
"9179 Drama|Thriller [Drama, Thriller] NaN \n",
"9259 (no genres listed) [(no genres listed)] NaN \n",
"9367 Drama [Drama] NaN \n",
"9448 (no genres listed) [(no genres listed)] NaN \n",
"9514 (no genres listed) [(no genres listed)] NaN \n",
"9515 (no genres listed) [(no genres listed)] NaN \n",
"9518 (no genres listed) [(no genres listed)] 2006–2007 \n",
"9525 (no genres listed) [(no genres listed)] NaN \n",
"9611 (no genres listed) [(no genres listed)] NaN "
]
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df_movies_tmp[~(df_movies_tmp['year'].str.isdecimal() == True)]"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [],
"source": [
"df_movies_tmp['year'] = df_movies_tmp['year'].replace({'2006–2007': '2006'}).fillna('1900').astype(int)"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"count 9742.000000\n",
"mean 1994.498255\n",
"std 18.818171\n",
"min 1900.000000\n",
"25% 1987.000000\n",
"50% 1999.000000\n",
"75% 2008.000000\n",
"max 2018.000000\n",
"Name: year, dtype: float64"
]
},
"execution_count": 16,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df_movies_tmp['year'].describe()"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"<matplotlib.axes._subplots.AxesSubplot at 0x11e203f10>"
]
},
"execution_count": 17,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAY4AAAD4CAYAAAD7CAEUAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4yLjAsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy8GearUAAAV+ElEQVR4nO3dfbRddX3n8fcHggg+lFBiBgM26IQ6MAWkEZg+TFUWz6sN1urCas1CVuNahbV0jZ0h2K6B1mEtdKx0mFpWack0WBXxqaaFlgmMU8bp8JAgAgGRKw8lMUIkVERbEPqdP87vlkN6783Z4Z57zk3er7XOunt/92/v8/vdneST/XD2SVUhSdKg9hp1ByRJ84vBIUnqxOCQJHVicEiSOjE4JEmdLBh1B4bhoIMOqqVLl466G5I0r2zcuPG7VbVoZ+12y+BYunQpGzZsGHU3JGleSfLwIO08VSVJ6sTgkCR1YnBIkjoxOCRJnRgckqRODA5JUicGhySpE4NDktSJwSFJ6mS3/OS4JAEsXX3tyN77oUvOGNl7D5tHHJKkTgwOSVInBockqRODQ5LUicEhSerE4JAkdWJwSJI6MTgkSZ0YHJKkTgwOSVInBockqRODQ5LUicEhSerE4JAkdWJwSJI6MTgkSZ0MLTiSvDTJrUm+nmRTkt9p9cOS3JJkIslnk7yk1fdt8xNt+dK+bV3Q6vclOWVYfZYk7dwwjzieBt5SVUcDxwCnJjkB+AhwaVX9a+AJ4JzW/hzgiVa/tLUjyRHAWcCRwKnAHybZe4j9liTNYGjBUT1Ptdl92quAtwCfb/W1wJltekWbpy0/MUla/eqqerqqHgQmgOOG1W9J0syGeo0jyd5J7gAeA9YD3wL+vqqebU02A0va9BLgEYC2/HvAj/fXp1in/71WJdmQZMO2bduGMRxJEkMOjqp6rqqOAQ6hd5Tw+iG+1xVVtbyqli9atGhYbyNJe7w5uauqqv4e+Arw74ADkixoiw4BtrTpLcChAG35jwGP99enWEeSNMeGeVfVoiQHtOn9gJOAe+kFyK+0ZiuBL7fpdW2etvx/VVW1+lntrqvDgGXArcPqtyRpZgt23mSXHQysbXdA7QVcU1V/meQe4Ook/wX4GnBla38l8MkkE8B2endSUVWbklwD3AM8C5xbVc8Nsd+SpBkMLTiq6k7gDVPUH2CKu6Kq6h+Bt0+zrYuBi2e7j5Kk7vzkuCSpE4NDktSJwSFJ6sTgkCR1YnBIkjoxOCRJnRgckqRODA5JUicGhySpk2E+ckSS9lhLV187kvd96JIzhv4eHnFIkjoxOCRJnRgckqRODA5JUicGhySpE4NDktSJwSFJ6sTgkCR1YnBIkjoxOCRJnRgckqRODA5JUidDC44khyb5SpJ7kmxK8v5WvyjJliR3tNfpfetckGQiyX1JTumrn9pqE0lWD6vPkqSdG+bTcZ8FPlhVtyd5BbAxyfq27NKq+lh/4yRHAGcBRwKvBm5Icnhb/AngJGAzcFuSdVV1zxD7LkmaxtCCo6q2Alvb9PeT3AssmWGVFcDVVfU08GCSCeC4tmyiqh4ASHJ1a2twSNIIzMk1jiRLgTcAt7TSeUnuTLImycJWWwI80rfa5labrr7je6xKsiHJhm3bts3yCCRJk4YeHEleDnwB+EBVPQlcDrwOOIbeEcnvzcb7VNUVVbW8qpYvWrRoNjYpSZrCUL8BMMk+9ELjU1X1RYCqerRv+R8Df9lmtwCH9q1+SKsxQ12SNMeGeVdVgCuBe6vq4331g/uavRW4u02vA85Ksm+Sw4BlwK3AbcCyJIcleQm9C+jrhtVvSdLMhnnE8bPArwF3Jbmj1T4EvDPJMUABDwHvA6iqTUmuoXfR+1ng3Kp6DiDJecD1wN7AmqraNMR+S5JmMMy7qr4KZIpF182wzsXAxVPUr5tpPUnS3PGT45KkTgwOSVInBockqRODQ5LUic
EhSerE4JAkdWJwSJI6MTgkSZ0YHJKkTgwOSVInBockqRODQ5LUicEhSerE4JAkdWJwSJI6MTgkSZ0YHJKkTgwOSVInAwVHkp8adkckSfPDoEccf5jk1iS/keTHhtojSdJYGyg4qurngXcBhwIbk3w6yUlD7ZkkaSwNfI2jqu4Hfhs4H/gF4LIk30jyy8PqnCRp/Ax6jeOoJJcC9wJvAX6xqv5Nm750mnUOTfKVJPck2ZTk/a1+YJL1Se5vPxe2epJclmQiyZ1Jju3b1srW/v4kK1/kmCVJL8KgRxz/HbgdOLqqzq2q2wGq6tv0jkKm8izwwao6AjgBODfJEcBq4MaqWgbc2OYBTgOWtdcq4HLoBQ1wIXA8cBxw4WTYSJLm3qDBcQbw6ar6B4AkeyXZH6CqPjnVClW1tS9gvk/vaGUJsAJY25qtBc5s0yuAq6rnZuCAJAcDpwDrq2p7VT0BrAdO7ThOSdIsGTQ4bgD265vfv9UGkmQp8AbgFmBxVW1ti74DLG7TS4BH+lbb3GrT1Xd8j1VJNiTZsG3btkG7JknqaNDgeGlVPTU506b3H2TFJC8HvgB8oKqe7F9WVQXUgH2YUVVdUVXLq2r5okWLZmOTkqQpDBocP9jhYvVPA/+ws5WS7EMvND5VVV9s5UfbKSjaz8dafQu9230nHdJq09UlSSMwaHB8APhckv+T5KvAZ4HzZlohSYArgXur6uN9i9YBk3dGrQS+3Fd/T7u76gTge+2U1vXAyUkWtoviJ7eaJGkEFgzSqKpuS/J64Cdb6b6q+tFOVvtZ4NeAu5Lc0WofAi4BrklyDvAw8I627DrgdGAC+CFwdnvv7Uk+DNzW2v1uVW0fpN+SpNk3UHA0bwSWtnWOTUJVXTVd46r6KpBpFp84RfsCzp1mW2uANR36KkkakoGCI8kngdcBdwDPtXIB0waHJGn3NOgRx3LgiHZUIEnagw16cfxu4F8NsyOSpPlh0COOg4B7ktwKPD1ZrKpfGkqvJElja9DguGiYnZAkzR+D3o77N0l+AlhWVTe051TtPdyuSZLG0aCPVf914PPAH7XSEuDPh9UpSdL4GvRU1bn0Hml+C/S+1CnJq4bWK0m7laWrrx11FzSLBr2r6umqemZyJskCZunhhJKk+WXQ4PibJB8C9mvfNf454C+G1y1J0rgaNDhWA9uAu4D30Xuu1HTf/CdJ2o0NelfVPwF/3F6SpD3YoM+qepAprmlU1WtnvUeSpLHW5VlVk14KvB04cPa7I0kadwNd46iqx/teW6rq94Ezhtw3SdIYGvRU1bF9s3vROwLp8l0ekqTdxKD/+P9e3/SzwEM8/819kqQ9yKB3Vb152B2RJM0Pg56q+g8zLa+qj89OdyRJ467LXVVvBNa1+V8EbgXuH0anJEnja9DgOAQ4tqq+D5DkIuDaqnr3sDomSRpPgz5yZDHwTN/8M60mSdrDDBocVwG3JrmoHW3cAqydaYUka5I8luTuvtpFSbYkuaO9Tu9bdkGSiST3JTmlr35qq00kWd1pdJKkWTfoXVUXJ/kr4Odb6eyq+tpOVvtT4A/ohU6/S6vqY/2FJEcAZwFHAq8GbkhyeFv8CeAkYDNwW5J1VXXPIP2WJM2+QY84APYHnqyq/wZsTnLYTI2r6iZg+4DbXgFcXVVPV9WDwAS9L446Dpioqgfa94Fc3dpKkkZk0K+OvRA4H7iglfYB/mwX3/O8JHe2U1kLW20J8Ehfm82tNl1dkjQigx5xvBX4JeAHAFX1beAVu/B+lwOvA44BtvLCT6S/KElWJdmQZMO2bdtma7OSpB0MGhzPVFXRHq2e5GW78mZV9WhVPdf3/R7HtUVbgEP7mh7SatPVp9r2FVW1vKqWL1q0aFe6J0kawKDBcU2SPwIOSPLrwA3swpc6JTm4b/atwOQdV+uAs5Ls266dLKP3AcPbgGVJDkvyEnoX0NchSRqZnd5VlSTAZ4HXA08CPwn856pav5P1PgO8CTgoyWbgQu
BNSY6hd+TyEL2voaWqNiW5BriH3kMUz62q59p2zgOuB/YG1lTVpu7DlCTNlp0GR1VVkuuq6qeAGcNih/XeOUX5yhnaXwxcPEX9OnrfcS5JGgODnqq6Pckbh9oTSdK8MOizqo4H3p3kIXp3VoXewchRw+qYJGk8zRgcSV5TVX8HnDJTO0nSnmNnRxx/Tu+puA8n+UJVvW0uOiVJGl87u8aRvunXDrMjkqT5YWfBUdNMS5L2UDs7VXV0kifpHXns16bh+Yvjrxxq7yRJY2fG4KiqveeqI5Kk+aHLY9UlSTI4JEndGBySpE4MDklSJwaHJKkTg0OS1InBIUnqxOCQJHVicEiSOjE4JEmdGBySpE4MDklSJwaHJKkTg0OS1MnQgiPJmiSPJbm7r3ZgkvVJ7m8/F7Z6klyWZCLJnUmO7VtnZWt/f5KVw+qvJGkwwzzi+FPg1B1qq4Ebq2oZcGObBzgNWNZeq4DLoRc0wIXA8cBxwIWTYSNJGo2hBUdV3QRs36G8AljbptcCZ/bVr6qem4EDkhwMnAKsr6rtVfUEsJ5/GUaSpDk019c4FlfV1jb9HWBxm14CPNLXbnOrTVeXJI3IyC6OV1UBNVvbS7IqyYYkG7Zt2zZbm5Uk7WCug+PRdgqK9vOxVt8CHNrX7pBWm67+L1TVFVW1vKqWL1q0aNY7LknqmevgWAdM3hm1EvhyX/097e6qE4DvtVNa1wMnJ1nYLoqf3GqSpBFZMKwNJ/kM8CbgoCSb6d0ddQlwTZJzgIeBd7Tm1wGnAxPAD4GzAapqe5IPA7e1dr9bVTtecJckzaGhBUdVvXOaRSdO0baAc6fZzhpgzSx2TZL0IvjJcUlSJwaHJKkTg0OS1InBIUnqxOCQJHVicEiSOjE4JEmdGBySpE4MDklSJwaHJKkTg0OS1MnQnlUlabwsXX3tqLug3YRHHJKkTgwOSVInBockqRODQ5LUicEhSerE4JAkdWJwSJI6MTgkSZ0YHJKkTgwOSVInBockqZORBEeSh5LcleSOJBta7cAk65Pc334ubPUkuSzJRJI7kxw7ij5LknpGecTx5qo6pqqWt/nVwI1VtQy4sc0DnAYsa69VwOVz3lNJ0j8bp1NVK4C1bXotcGZf/arquRk4IMnBo+igJGl0wVHA/0yyMcmqVltcVVvb9HeAxW16CfBI37qbW+0FkqxKsiHJhm3btg2r35K0xxvV93H8XFVtSfIqYH2Sb/QvrKpKUl02WFVXAFcALF++vNO6kqTBjSQ4qmpL+/lYki8BxwGPJjm4qra2U1GPteZbgEP7Vj+k1aR5yS9U0nw356eqkrwsySsmp4GTgbuBdcDK1mwl8OU2vQ54T7u76gTge32ntCRJc2wURxyLgS8lmXz/T1fVXye5DbgmyTnAw8A7WvvrgNOBCeCHwNlz32VJ0qQ5D46qegA4eor648CJU9QLOHcOuiZJGsA43Y4rSZoHDA5JUicGhySpE4NDktSJwSFJ6sTgkCR1YnBIkjoZ1bOqpJHysR/SrvOIQ5LUicEhSerE4JAkdeI1Do2U1xqk+ccjDklSJwaHJKkTg0OS1InBIUnqxOCQJHVicEiSOjE4JEmd+DkOAX6eQtLgPOKQJHVicEiSOpk3wZHk1CT3JZlIsnrU/ZGkPdW8CI4kewOfAE4DjgDemeSI0fZKkvZM8+Xi+HHARFU9AJDkamAFcM8w3swLxZI0vfkSHEuAR/rmNwPH9zdIsgpY1WafSnLfi3i/g4Dvvoj1x8nuNBbYvcazO40FHM9YyEemLA86lp8Y5D3mS3DsVFVdAVwxG9tKsqGqls/GtkZtdxoL7F7j2Z3GAo5nnM32WObFNQ5gC3Bo3/whrSZJmmPzJThuA5YlOSzJS4CzgHUj7pMk7ZHmxamqqno2yXnA9cDewJqq2jTEt5yVU15jYncaC+xe49mdxgKOZ5zN6lhSVbO5PUnSbm6+nKqSJI0Jg0OS1MkeERxJ1iR5LMndfbWjk/y/JH
cl+Yskr+xbdkF7tMl9SU7pq4/FY0+6jCfJSUk2tvrGJG/pW+enW30iyWVJMs5j6Vv+miRPJfnNvtq82zdt2VFt2aa2/KWtPvJ903U8SfZJsrbV701yQd86I98/SQ5N8pUk97Tf9/tb/cAk65Pc334ubPW03/1EkjuTHNu3rZWt/f1JVs6DsbyrjeGuJH+b5Oi+bXXfN1W127+Afw8cC9zdV7sN+IU2/V7gw236CODrwL7AYcC36F2Q37tNvxZ4SWtzxDwYzxuAV7fpfwts6VvnVuAEIMBfAaeN81j6ln8e+Bzwm21+vu6bBcCdwNFt/seBvcdl3+zCeH4VuLpN7w88BCwdl/0DHAwc26ZfAXyz/X3/KLC61VcDH2nTp7fffdq+uKXVDwQeaD8XtumFYz6Wn5nsI71HN02OZZf2zR5xxFFVNwHbdygfDtzUptcDb2vTK+j94X+6qh4EJug98uSfH3tSVc8Ak489mXNdxlNVX6uqb7f6JmC/JPsmORh4ZVXdXL0/QVcBZw6/9y/Ucd+Q5EzgQXpjmTQv9w1wMnBnVX29rft4VT03Lvum9anLeAp4WZIFwH7AM8CTjMn+qaqtVXV7m/4+cC+9p1KsANa2Zmt5/ne9Ariqem4GDmj75hRgfVVtr6on6P0OTp3DoXQeS1X9besrwM30PgsHu7hv9ojgmMYmnv8FvZ3nP2A41eNNlsxQHxfTjaff24Dbq+ppen3f3LdsnMYz5ViSvBw4H/idHdrP131zOFBJrk9ye5L/1OrjvG9g+vF8HvgBsBX4O+BjVbWdMdw/SZbSOxq/BVhcVVvbou8Ai9v0vPi3YMCx9DuH3pEU7OJY9uTgeC/wG0k20jvUe2bE/XmxZhxPkiOBjwDvG0HfuppuLBcBl1bVU6Pq2C6abjwLgJ8D3tV+vjXJiaPpYifTjec44Dng1fRO834wyWtH08Xptf+AfAH4QFU92b+sHeHNm88odB1LkjfTC47zX8z7zosPAA5DVX2D3qkCkhwOnNEWzfR4k7F97MkM4yHJIcCXgPdU1bdaeQvPH67CGI1nhrEcD/xKko8CBwD/lOQfgY3Mz32zGbipqr7bll1H73rCnzGm+wZmHM+vAn9dVT8CHkvyf4Hl9P5HOxb7J8k+9P6h/VRVfbGVH01ycFVtbaeiHmv16f4t2AK8aYf6/x5mv6fScSwkOQr4E3rXyx5v5V17nNNcXtAZ5YveRbr+C3yvaj/3oncO+b1t/kheeHH8AXoXkBa06cN4/iLSkfNgPAe0vv7yFNvY8QLs6eM8lh3WuYjnL47P132zELid3oXkBcANwBnjtG86jud84H+06ZfR+9qDo8Zl/7Tf5VXA7+9Q/6+88ILyR9v0Gbzw4vitrX4gvetsC9vrQeDAMR/La+hdr/2ZHdrv0r4ZyR/EEfyB+Qy9864/ove/vHOA99O7E+GbwCW0T9G39r9F706D++i7m4XeXRbfbMt+az6MB/hteued7+h7Tf7FXw7c3cbzB/2/g3Ecyw7rXUQLjvm6b1r7d9O7ZnD35F/ycdk3u/Bn7eX07nbbRC80/uM47R96pwOL3p1sk38XTqd3N9uNwP30wvvA1j70vkDuW8BdwPK+bb2X3j/EE8DZ82AsfwI80dd2w4vZNz5yRJLUyZ58cVyStAsMDklSJwaHJKkTg0OS1InBIUnqxOCQJHVicEiSOvn/B9GUPOrAFtMAAAAASUVORK5CYII=\n",
"text/plain": [
"<Figure size 432x288 with 1 Axes>"
]
},
"metadata": {
"needs_background": "light"
},
"output_type": "display_data"
}
],
"source": [
"df_movies_tmp['year'].plot.hist()"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [],
"source": [
"from sklearn.preprocessing import MinMaxScaler"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [],
"source": [
"mmscaler = MinMaxScaler()\n",
"df_movies_tmp['recency'] = mmscaler.fit_transform(df_movies_tmp[['year']])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Merge"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [],
"source": [
"df_item_features = pd.merge(df_genres_pivot, df_movies_tmp[['movieId', 'recency']], on='movieId').set_index('movieId')"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>(no genres listed)</th>\n",
" <th>Action</th>\n",
" <th>Adventure</th>\n",
" <th>Animation</th>\n",
" <th>Children</th>\n",
" <th>Comedy</th>\n",
" <th>Crime</th>\n",
" <th>Documentary</th>\n",
" <th>Drama</th>\n",
" <th>Fantasy</th>\n",
" <th>...</th>\n",
" <th>Horror</th>\n",
" <th>IMAX</th>\n",
" <th>Musical</th>\n",
" <th>Mystery</th>\n",
" <th>Romance</th>\n",
" <th>Sci-Fi</th>\n",
" <th>Thriller</th>\n",
" <th>War</th>\n",
" <th>Western</th>\n",
" <th>recency</th>\n",
" </tr>\n",
" <tr>\n",
" <th>movieId</th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0.805085</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0.805085</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0.805085</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0.805085</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0.805085</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>5 rows × 21 columns</p>\n",
"</div>"
],
"text/plain": [
" (no genres listed) Action Adventure Animation Children Comedy \\\n",
"movieId \n",
"1 0 0 1 1 1 1 \n",
"2 0 0 1 0 1 0 \n",
"3 0 0 0 0 0 1 \n",
"4 0 0 0 0 0 1 \n",
"5 0 0 0 0 0 1 \n",
"\n",
" Crime Documentary Drama Fantasy ... Horror IMAX Musical \\\n",
"movieId ... \n",
"1 0 0 0 1 ... 0 0 0 \n",
"2 0 0 0 1 ... 0 0 0 \n",
"3 0 0 0 0 ... 0 0 0 \n",
"4 0 0 1 0 ... 0 0 0 \n",
"5 0 0 0 0 ... 0 0 0 \n",
"\n",
" Mystery Romance Sci-Fi Thriller War Western recency \n",
"movieId \n",
"1 0 0 0 0 0 0 0.805085 \n",
"2 0 0 0 0 0 0 0.805085 \n",
"3 0 1 0 0 0 0 0.805085 \n",
"4 0 1 0 0 0 0 0.805085 \n",
"5 0 0 0 0 0 0 0.805085 \n",
"\n",
"[5 rows x 21 columns]"
]
},
"execution_count": 21,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df_item_features.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Create a Sparse Matrix"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [],
"source": [
"from sklearn.base import BaseEstimator, TransformerMixin"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {},
"outputs": [],
"source": [
"def df2sparse(df, index, columns, value, shape):\n",
"    \"\"\"Build a SciPy CSR matrix from a long-format DataFrame.\n",
"\n",
"    Parameters\n",
"    ----------\n",
"    df : pandas.DataFrame\n",
"        One row per matrix entry.\n",
"    index : str\n",
"        Name of the column holding the row positions.\n",
"    columns : str\n",
"        Name of the column holding the column positions.\n",
"    value : str\n",
"        Name of the column holding the entry values.\n",
"    shape : tuple\n",
"        Shape passed on to ``scipy.sparse.csr_matrix``.\n",
"\n",
"    Returns\n",
"    -------\n",
"    scipy.sparse.csr_matrix\n",
"    \"\"\"\n",
"    coordinates = (df[index], df[columns])\n",
"    return sp.csr_matrix((df[value], coordinates), shape=shape)"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {},
"outputs": [],
"source": [
"class ItemFeatureConverter(BaseEstimator, TransformerMixin):\n",
"    \"\"\"Turn an item-feature DataFrame into a sparse item-by-feature matrix.\n",
"\n",
"    Parameters\n",
"    ----------\n",
"    dataset : object\n",
"        ``transform`` reads its ``col_item``, ``item2index`` and ``n_items``\n",
"        attributes to name the item column, map item labels to row\n",
"        positions and size the output matrix.\n",
"    \"\"\"\n",
"\n",
"    def __init__(self, dataset):\n",
"        self.dataset = dataset\n",
"\n",
"    def fit(self, df, y=None):\n",
"        \"\"\"Record the columns of ``df`` and the matrix column assigned to each.\n",
"\n",
"        ``y`` is accepted for pipeline compatibility and is not used.\n",
"        \"\"\"\n",
"        self.ifeat2index = {col: i for i, col in enumerate(df.columns)}\n",
"        self.labels_item_features = list(self.ifeat2index.keys())\n",
"        self.n_item_features = len(self.ifeat2index)\n",
"\n",
"        return self\n",
"\n",
"    def transform(self, df):\n",
"        \"\"\"Return ``df`` as a CSR matrix of shape (n_items, n_item_features).\n",
"\n",
"        Columns of ``df`` that were not seen by ``fit`` are ignored.\n",
"        \"\"\"\n",
"        # Read everything from the dataset given to the constructor instead of\n",
"        # a notebook-level global, so each converter uses its own dataset.\n",
"        dataset = self.dataset\n",
"        col_item = dataset.col_item\n",
"\n",
"        columns_target = df.columns[df.columns.isin(self.labels_item_features)]\n",
"\n",
"        # Long format: one row per (item, feature) pair. Zero values are not\n",
"        # filtered out, so they become explicitly stored matrix entries.\n",
"        df_stacked = df[columns_target].stack().reset_index()\n",
"        df_stacked.columns = [col_item, 'label_feature', 'value']\n",
"\n",
"        df_stacked['item_index'] = df_stacked[col_item].map(dataset.item2index)\n",
"        df_stacked['feature_index'] = df_stacked['label_feature'].map(self.ifeat2index)\n",
"\n",
"        return df2sparse(df_stacked, 'item_index', 'feature_index', 'value',\n",
"                         shape=(dataset.n_items, self.n_item_features))"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {},
"outputs": [],
"source": [
"ifeat_converter = ItemFeatureConverter(dataset)"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {},
"outputs": [],
"source": [
"csr_item_features = ifeat_converter.fit_transform(df_item_features)"
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"21"
]
},
"execution_count": 27,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"ifeat_converter.n_item_features"
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"<9742x21 sparse matrix of type '<class 'numpy.float64'>'\n",
"\twith 204582 stored elements in Compressed Sparse Row format>"
]
},
"execution_count": 28,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"csr_item_features"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Modeling\n",
"- Using LightFM"
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {},
"outputs": [],
"source": [
"from lightfm import cross_validation"
]
},
{
"cell_type": "code",
"execution_count": 30,
"metadata": {},
"outputs": [],
"source": [
"# Seed the split so the train/test partition is the same on every run.\n",
"csr_ui_matrix_train, csr_ui_matrix_test = cross_validation.random_train_test_split(\n",
"    csr_ui_matrix, random_state=np.random.RandomState(42))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Make a Class"
]
},
{
"cell_type": "code",
"execution_count": 31,
"metadata": {},
"outputs": [],
"source": [
"from sklearn.base import ClassifierMixin"
]
},
{
"cell_type": "code",
"execution_count": 32,
"metadata": {},
"outputs": [],
"source": [
"class LightFMClassifier(BaseEstimator, ClassifierMixin):\n",
"    \"\"\"scikit-learn style wrapper around a LightFM model.\n",
"\n",
"    Parameters\n",
"    ----------\n",
"    feature_type : {'user', 'item', None}, default=None\n",
"        Which side of the model the ``features`` argument of ``fit`` and\n",
"        ``score`` is passed to. With ``None`` the features are not used.\n",
"    **params\n",
"        Keyword arguments forwarded unchanged to ``LightFM``.\n",
"\n",
"    Raises\n",
"    ------\n",
"    ValueError\n",
"        If ``feature_type`` is not one of the accepted values.\n",
"\n",
"    Notes\n",
"    -----\n",
"    ``**params`` are not reported by ``get_params()``, so they are missing\n",
"    from the estimator repr and are not carried over by ``sklearn.base.clone``.\n",
"    \"\"\"\n",
"\n",
"    def __init__(self, feature_type=None, **params):\n",
"        # Validate before building the model so a bad value fails fast.\n",
"        if feature_type not in ('user', 'item', None):\n",
"            raise ValueError(f'Invalid feature_type: {feature_type!r}')\n",
"        self.feature_type = feature_type\n",
"        self.model = LightFM(**params)\n",
"\n",
"    def _split_features(self, features):\n",
"        \"\"\"Return ``(user_features, item_features)`` according to ``feature_type``.\"\"\"\n",
"        if self.feature_type == 'user':\n",
"            return features, None\n",
"        if self.feature_type == 'item':\n",
"            return None, features\n",
"        return None, None\n",
"\n",
"    def fit(self, features, ui_matrix, epochs=10):\n",
"        \"\"\"Train the wrapped model on ``ui_matrix`` for ``epochs`` epochs.\n",
"\n",
"        Returns ``self`` so the estimator can be used inside a Pipeline.\n",
"        \"\"\"\n",
"        user_features, item_features = self._split_features(features)\n",
"\n",
"        self.model.fit(ui_matrix,\n",
"                       user_features=user_features,\n",
"                       item_features=item_features,\n",
"                       epochs=epochs)\n",
"\n",
"        return self\n",
"\n",
"    def predict(self, features):\n",
"        \"\"\"Return one sigmoid-squashed, user-independent score per row of ``features``.\n",
"\n",
"        Only the learned item biases are used; embeddings do not enter the score.\n",
"        \"\"\"\n",
"        item_bias, _ = self.model.get_item_representations()\n",
"\n",
"        # Score the matrix that was passed in; reading a notebook-level global\n",
"        # here would silently ignore the caller's (e.g. permuted) features.\n",
"        recommend_score = features.dot(item_bias)\n",
"        return 1 / (1 + np.exp(-recommend_score))\n",
"\n",
"    def score(self, features, ui_matrix):\n",
"        \"\"\"Return the mean of ``evaluation.auc_score`` over ``ui_matrix``.\"\"\"\n",
"        user_features, item_features = self._split_features(features)\n",
"\n",
"        return evaluation.auc_score(self.model, ui_matrix,\n",
"                                    user_features=user_features,\n",
"                                    item_features=item_features).mean()\n",
"\n",
"    @property\n",
"    def feature_importances_(self):\n",
"        \"\"\"Mean absolute linear coefficient of each item-side representation.\n",
"\n",
"        The coefficients are the dot products of the user embeddings with the\n",
"        item embeddings plus the item bias; the absolute values are averaged\n",
"        over the first (user) axis.\n",
"        \"\"\"\n",
"        _, user_embeddings = self.model.get_user_representations()\n",
"        item_bias, item_embeddings = self.model.get_item_representations()\n",
"\n",
"        item_coefs = np.dot(user_embeddings, item_embeddings.T) + item_bias\n",
"        return np.abs(item_coefs).mean(0)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## fit and score"
]
},
{
"cell_type": "code",
"execution_count": 33,
"metadata": {},
"outputs": [],
"source": [
"from sklearn.pipeline import Pipeline"
]
},
{
"cell_type": "code",
"execution_count": 34,
"metadata": {},
"outputs": [],
"source": [
"pipe_lfm = Pipeline([('item_features', ItemFeatureConverter(dataset)),\n",
" ('lfm', LightFMClassifier(feature_type='item', no_components=5))])"
]
},
{
"cell_type": "code",
"execution_count": 35,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Pipeline(memory=None,\n",
" steps=[('item_features',\n",
" ItemFeatureConverter(dataset=<__main__.DataSet object at 0x11a1d2d90>)),\n",
" ('lfm', LightFMClassifier(feature_type='item'))],\n",
" verbose=False)"
]
},
"execution_count": 35,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"pipe_lfm.fit(df_item_features, csr_ui_matrix_train)"
]
},
{
"cell_type": "code",
"execution_count": 36,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0.597118"
]
},
"execution_count": 36,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"pipe_lfm.score(df_item_features, csr_ui_matrix_test)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Feature Importances"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Based on Embedding Features"
]
},
{
"cell_type": "code",
"execution_count": 37,
"metadata": {},
"outputs": [],
"source": [
"labels_item_features = pipe_lfm.named_steps['item_features'].labels_item_features"
]
},
{
"cell_type": "code",
"execution_count": 38,
"metadata": {},
"outputs": [],
"source": [
"importance = pipe_lfm.named_steps['lfm'].feature_importances_"
]
},
{
"cell_type": "code",
"execution_count": 39,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"<matplotlib.axes._subplots.AxesSubplot at 0x10d82ad50>"
]
},
"execution_count": 39,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"image/png": "\n",
"text/plain": [
"<Figure size 432x288 with 1 Axes>"
]
},
"metadata": {
"needs_background": "light"
},
"output_type": "display_data"
}
],
"source": [
"pd.Series(importance, index=labels_item_features).sort_values().plot.barh()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Permutation Importance"
]
},
{
"cell_type": "code",
"execution_count": 40,
"metadata": {},
"outputs": [],
"source": [
"from sklearn.inspection import permutation_importance"
]
},
{
"cell_type": "code",
"execution_count": 41,
"metadata": {},
"outputs": [],
"source": [
"# Fixed seed so the shuffles, and therefore the importances, are repeatable.\n",
"result = permutation_importance(pipe_lfm, df_item_features, csr_ui_matrix_test,\n",
"                                n_repeats=8, n_jobs=4, random_state=42)"
]
},
{
"cell_type": "code",
"execution_count": 42,
"metadata": {},
"outputs": [],
"source": [
"import matplotlib.pyplot as plt\n",
"%matplotlib inline"
]
},
{
"cell_type": "code",
"execution_count": 43,
"metadata": {
"scrolled": true
},
"outputs": [
{
"data": {
"image/png": "\n",
"text/plain": [
"<Figure size 432x288 with 1 Axes>"
]
},
"metadata": {
"needs_background": "light"
},
"output_type": "display_data"
}
],
"source": [
"# Ascending order, so the most important feature is drawn at the top.\n",
"sorted_idx = result.importances_mean.argsort()\n",
"\n",
"fig, ax = plt.subplots()\n",
"ax.boxplot(result.importances[sorted_idx].T,\n",
"           vert=False, labels=df_item_features.columns[sorted_idx])\n",
"# The importances were computed against csr_ui_matrix_test, not the train split.\n",
"ax.set_title(\"Permutation Importances (test set)\")\n",
"fig.tight_layout()\n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.4"
},
"toc": {
"base_numbering": 1,
"nav_menu": {},
"number_sections": true,
"sideBar": true,
"skip_h1_title": false,
"title_cell": "Table of Contents",
"title_sidebar": "Contents",
"toc_cell": false,
"toc_position": {},
"toc_section_display": true,
"toc_window_display": false
}
},
"nbformat": 4,
"nbformat_minor": 4
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment