Skip to content

Instantly share code, notes, and snippets.

@pilipolio
Created October 17, 2016 13:17
Show Gist options
  • Save pilipolio/5b6a373f8be14ed9035a22272bfb766c to your computer and use it in GitHub Desktop.
Save pilipolio/5b6a373f8be14ed9035a22272bfb766c to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"--2016-10-17 14:09:06-- http://files.grouplens.org/datasets/movielens/ml-1m.zip\n",
"Resolving files.grouplens.org... 128.101.34.146\n",
"Connecting to files.grouplens.org|128.101.34.146|:80... connected.\n",
"HTTP request sent, awaiting response... 200 OK\n",
"Length: 5917549 (5.6M) [application/zip]\n",
"Saving to: 'ml-1m.zip.1'\n",
"\n",
"ml-1m.zip.1 100%[=====================>] 5.64M 550KB/s in 21s \n",
"\n",
"2016-10-17 14:09:27 (279 KB/s) - 'ml-1m.zip.1' saved [5917549/5917549]\n",
"\n",
"Archive: ml-1m.zip\n",
" inflating: ./ml-1m/movies.dat \n",
" inflating: ./ml-1m/ratings.dat \n",
" inflating: ./ml-1m/README \n",
" inflating: ./ml-1m/users.dat \n"
]
}
],
"source": [
"# Fetch the MovieLens 1M archive and unpack it into ./ml-1m/.\n",
"# -nc: skip the download when ml-1m.zip already exists; otherwise wget saves a second copy\n",
"# as ml-1m.zip.1 and unzip below keeps reading the older ml-1m.zip.\n",
"! wget -nc http://files.grouplens.org/datasets/movielens/ml-1m.zip\n",
"# -o: overwrite previously extracted files without prompting, so the cell can be re-run.\n",
"! unzip -o ml-1m.zip -d ."
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/Users/gui/.virtualenvs/gui3/lib/python3.5/site-packages/ipykernel/__main__.py:3: ParserWarning: Falling back to the 'python' engine because the 'c' engine does not support regex separators (separators > 1 char and different from '\\s+' are interpreted as regex); you can avoid this warning by specifying engine='python'.\n",
" app.launch_new_instance()\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>user</th>\n",
" <th>item</th>\n",
" <th>rating</th>\n",
" <th>timestamp</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>1193</td>\n",
" <td>5</td>\n",
" <td>978300760</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1</td>\n",
" <td>661</td>\n",
" <td>3</td>\n",
" <td>978302109</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>1</td>\n",
" <td>914</td>\n",
" <td>3</td>\n",
" <td>978301968</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>1</td>\n",
" <td>3408</td>\n",
" <td>4</td>\n",
" <td>978300275</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>1</td>\n",
" <td>2355</td>\n",
" <td>5</td>\n",
" <td>978824291</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" user item rating timestamp\n",
"0 1 1193 5 978300760\n",
"1 1 661 3 978302109\n",
"2 1 914 3 978301968\n",
"3 1 3408 4 978300275\n",
"4 1 2355 5 978824291"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import pandas as pd\n",
"\n",
"# ratings.dat is '::'-delimited; column names are supplied here via names=.\n",
"# A multi-character separator is only handled by the 'python' parser engine, so request it\n",
"# explicitly instead of letting pandas fall back to it with a ParserWarning.\n",
"ratings = pd.read_csv(\n",
"    './ml-1m/ratings.dat',\n",
"    sep='::',\n",
"    engine='python',\n",
"    names=['user', 'item', 'rating', 'timestamp'],\n",
")\n",
"ratings.head()"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>user_support</th>\n",
" <th>item_support</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>count</th>\n",
" <td>6040.000000</td>\n",
" <td>3706.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>mean</th>\n",
" <td>165.597517</td>\n",
" <td>269.889099</td>\n",
" </tr>\n",
" <tr>\n",
" <th>std</th>\n",
" <td>192.747029</td>\n",
" <td>384.047838</td>\n",
" </tr>\n",
" <tr>\n",
" <th>min</th>\n",
" <td>20.000000</td>\n",
" <td>1.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>25%</th>\n",
" <td>44.000000</td>\n",
" <td>33.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>50%</th>\n",
" <td>96.000000</td>\n",
" <td>123.500000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>75%</th>\n",
" <td>208.000000</td>\n",
" <td>350.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>max</th>\n",
" <td>2314.000000</td>\n",
" <td>3428.000000</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" user_support item_support\n",
"count 6040.000000 3706.000000\n",
"mean 165.597517 269.889099\n",
"std 192.747029 384.047838\n",
"min 20.000000 1.000000\n",
"25% 44.000000 33.000000\n",
"50% 96.000000 123.500000\n",
"75% 208.000000 350.000000\n",
"max 2314.000000 3428.000000"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Support = number of distinct counterparts: users per item, and items per user.\n",
"item_supports = ratings.groupby('item')['user'].nunique().to_frame('item_support')\n",
"user_supports = ratings.groupby('user')['item'].nunique().to_frame('user_support')\n",
"\n",
"# Summary statistics of both support distributions, side by side (one column per frame).\n",
"pd.concat([user_supports.describe(), item_supports.describe()], axis=1)"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/Users/gui/.virtualenvs/gui3/lib/python3.5/site-packages/ipykernel/__main__.py:1: ParserWarning: Falling back to the 'python' engine because the 'c' engine does not support regex separators (separators > 1 char and different from '\\s+' are interpreted as regex); you can avoid this warning by specifying engine='python'.\n",
" if __name__ == '__main__':\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>title</th>\n",
" <th>genres</th>\n",
" <th>first_genre</th>\n",
" </tr>\n",
" <tr>\n",
" <th>item</th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>Toy Story (1995)</td>\n",
" <td>Animation|Children's|Comedy</td>\n",
" <td>Animation</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>Jumanji (1995)</td>\n",
" <td>Adventure|Children's|Fantasy</td>\n",
" <td>Adventure</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>Grumpier Old Men (1995)</td>\n",
" <td>Comedy|Romance</td>\n",
" <td>Comedy</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>Waiting to Exhale (1995)</td>\n",
" <td>Comedy|Drama</td>\n",
" <td>Comedy</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>Father of the Bride Part II (1995)</td>\n",
" <td>Comedy</td>\n",
" <td>Comedy</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" title genres \\\n",
"item \n",
"1 Toy Story (1995) Animation|Children's|Comedy \n",
"2 Jumanji (1995) Adventure|Children's|Fantasy \n",
"3 Grumpier Old Men (1995) Comedy|Romance \n",
"4 Waiting to Exhale (1995) Comedy|Drama \n",
"5 Father of the Bride Part II (1995) Comedy \n",
"\n",
" first_genre \n",
"item \n",
"1 Animation \n",
"2 Adventure \n",
"3 Comedy \n",
"4 Comedy \n",
"5 Comedy "
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# movies.dat is '::'-delimited; the 'item' column becomes the index.\n",
"# engine='python' is requested explicitly because the 'c' engine does not support\n",
"# multi-character separators and would otherwise fall back with a ParserWarning.\n",
"movies = pd.read_csv(\n",
"    './ml-1m/movies.dat',\n",
"    sep='::',\n",
"    engine='python',\n",
"    names=['item', 'title', 'genres'],\n",
"    index_col='item',\n",
")\n",
"# genres is a '|'-separated list; keep the first entry as a single label per movie.\n",
"movies['first_genre'] = movies['genres'].str.split('|').str.get(0)\n",
"\n",
"movies.head()"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/Users/gui/.virtualenvs/gui3/lib/python3.5/site-packages/ipykernel/__main__.py:1: ParserWarning: Falling back to the 'python' engine because the 'c' engine does not support regex separators (separators > 1 char and different from '\\s+' are interpreted as regex); you can avoid this warning by specifying engine='python'.\n",
" if __name__ == '__main__':\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>gender</th>\n",
" <th>age</th>\n",
" <th>occupation</th>\n",
" <th>zipcode</th>\n",
" </tr>\n",
" <tr>\n",
" <th>user</th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>F</td>\n",
" <td>1</td>\n",
" <td>10</td>\n",
" <td>48067</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>M</td>\n",
" <td>56</td>\n",
" <td>16</td>\n",
" <td>70072</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>M</td>\n",
" <td>25</td>\n",
" <td>15</td>\n",
" <td>55117</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>M</td>\n",
" <td>45</td>\n",
" <td>7</td>\n",
" <td>02460</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>M</td>\n",
" <td>25</td>\n",
" <td>20</td>\n",
" <td>55455</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" gender age occupation zipcode\n",
"user \n",
"1 F 1 10 48067\n",
"2 M 56 16 70072\n",
"3 M 25 15 55117\n",
"4 M 45 7 02460\n",
"5 M 25 20 55455"
]
},
"execution_count": 18,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# See http://files.grouplens.org/datasets/movielens/ml-1m-README.txt for more details\n",
"# users.dat is '::'-delimited; the 'user' column becomes the index.\n",
"# engine='python' is requested explicitly because the 'c' engine does not support\n",
"# multi-character separators and would otherwise fall back with a ParserWarning.\n",
"users = pd.read_csv(\n",
"    './ml-1m/users.dat',\n",
"    sep='::',\n",
"    engine='python',\n",
"    names=['user', 'gender', 'age', 'occupation', 'zipcode'],\n",
"    index_col='user',\n",
")\n",
"users.head()"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.5.0"
}
},
"nbformat": 4,
"nbformat_minor": 0
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment