Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Save friso/11282071 to your computer and use it in GitHub Desktop.
Save friso/11282071 to your computer and use it in GitHub Desktop.
{
"metadata": {
"name": "",
"signature": "sha256:fffb04a8605ccdef5610b121eeb72faf2288494ec4a8be9dc5e177d0432910b2"
},
"nbformat": 3,
"nbformat_minor": 0,
"worksheets": [
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Movielens\n",
"We'll use [the movielens dataset](http://grouplens.org/datasets/movielens/) which contains 10M movie ratings from about 70K users.\n",
"\n",
"To grab the data and upload to HDFS with a small block size (in order to have many splits):\n",
"\n",
" curl -O http://files.grouplens.org/datasets/movielens/ml-10m.zip\n",
" unzip ml-10m.zip\n",
" cd ml-10M100K/\n",
" hadoop fs -Ddfs.block.size=10240000 -put ratings.dat /\n",
"\n",
"Let's take a look..."
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"%load_ext rmagic\n",
"%pylab inline\n",
"%R library(ggplot2)\n",
"%R library(ggthemes)\n",
"\n",
"import numpy as np\n",
"import pandas as pd"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"Populating the interactive namespace from numpy and matplotlib\n"
]
}
],
"prompt_number": 1
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"filename = '/ratings.dat'\n",
"\n",
"# Read file and convert to proper types\n",
"converters = [int, int, float, int]\n",
"ratings = sc.textFile(filename).map(lambda line: [ func(d) for func,d in zip(converters, line.strip().split('::')) ] )\n",
"\n",
"# Keep ratings in memory\n",
"ratings.cache()\n",
"\n",
"ratings.count()"
],
"language": "python",
"metadata": {},
"outputs": [
{
"metadata": {},
"output_type": "pyout",
"prompt_number": 2,
"text": [
"10000054"
]
}
],
"prompt_number": 2
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# Histogram # ratings per user\n",
"counts_per_user = ratings\\\n",
" .keyBy(lambda (user, movie, rating, ts): user)\\\n",
" .groupByKey()\\\n",
" .map(lambda (user, ratings): len(ratings))\n",
"\n",
"histogram_per_user = counts_per_user\\\n",
" .map(lambda c: (c - c % 50, 1))\\\n",
" .reduceByKey(lambda x, y: x + y)\\\n",
" .collect()\n",
"\n",
"histogram_per_user_df = pd.DataFrame(histogram_per_user, columns = ['ratings', 'frequency'])\n",
"histogram_per_user_df.sort(inplace=True, columns='ratings')\n",
"histogram_per_user_df.describe()"
],
"language": "python",
"metadata": {},
"outputs": [
{
"html": [
"<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>ratings</th>\n",
" <th>frequency</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>count</th>\n",
" <td> 71.000000</td>\n",
" <td> 71.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>mean</th>\n",
" <td> 1957.042254</td>\n",
" <td> 984.197183</td>\n",
" </tr>\n",
" <tr>\n",
" <th>std</th>\n",
" <td> 1481.029945</td>\n",
" <td> 3814.385053</td>\n",
" </tr>\n",
" <tr>\n",
" <th>min</th>\n",
" <td> 0.000000</td>\n",
" <td> 1.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>25%</th>\n",
" <td> 875.000000</td>\n",
" <td> 2.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>50%</th>\n",
" <td> 1750.000000</td>\n",
" <td> 15.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>75%</th>\n",
" <td> 2625.000000</td>\n",
" <td> 143.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>max</th>\n",
" <td> 7350.000000</td>\n",
" <td> 26270.000000</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>8 rows \u00d7 2 columns</p>\n",
"</div>"
],
"metadata": {},
"output_type": "pyout",
"prompt_number": 3,
"text": [
" ratings frequency\n",
"count 71.000000 71.000000\n",
"mean 1957.042254 984.197183\n",
"std 1481.029945 3814.385053\n",
"min 0.000000 1.000000\n",
"25% 875.000000 2.000000\n",
"50% 1750.000000 15.000000\n",
"75% 2625.000000 143.000000\n",
"max 7350.000000 26270.000000\n",
"\n",
"[8 rows x 2 columns]"
]
}
],
"prompt_number": 3
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"%%R -w 1200 -i histogram_per_user_df\n",
"# Bar chart of the first 25 rows of the histogram frame passed in from Python\n",
"p <- ggplot(\n",
"    data = head(histogram_per_user_df, 25),\n",
"    mapping = aes(x = factor(ratings), y = frequency)\n",
"  ) +\n",
"  theme_economist() +\n",
"  geom_bar(stat = 'identity') +\n",
"  labs(title = '# ratings per user', x = '# of ratings', y = 'Frequency')\n",
"print(p)"
],
"language": "python",
"metadata": {},
"outputs": [
{
"metadata": {},
"output_type": "display_data",
"png": "iVBORw0KGgoAAAANSUhEUgAABLAAAAHgCAIAAAA69QPIAAAgAElEQVR4nO3daXxU5cE34DskgEIl\ngCKyCIIiKkrftrZFy6IiFhFEEXDHtVq1Lm1VbBUVd8UF3HCrC3WhUhcUFQEVFEVF0YqAoCxKgKei\nQBJ2QvJ+mOfNG03AJCZzgvd1feA3czI5939uJ2fO33PmTMasnOUBAACA+NRKOgAAAADJUAgBAAAi\npRACAABESiEEAACIlEIIAAAQKYUQAAAgUgohAABApBRCAACASCmEAAAAkVIIAQAAIqUQAgAAREoh\nBAAAiJRCCAAAECmFEAAAIFIKIQAAQKS2+UL4nxkfXHnJRcuW5MybM/vKSy5a+MXnlV7V53PnfD53\nzuaCgtTdDi2bdGjZZOWKFVWUFAAAoGbZ5gvhm69NfHb0k9kNG73z5uRnnnqi/g47VHpVR3XvelT3\nrnl5eam7zVvu2rzlrrVqZVRRUgAAgJolK+kAP9acT2e2btO2Xv36cz6d2XinnZrs3LSq1vz0y5NC\nCDs0yK6qFQIAANQo2/ARwmHXXtWhZZMpr01ctGB+h5ZNxj337xXffLPvrjsvWby4+DErV6xInfn5\nyUcf9ury20fuu2fFN98MufjCg365b8fWu/xmrzZnHt9/zqczNxcUdGjZJPUrnTu2/8+MD1I3Onds\nn7tqVfFKXhv/8u8P/NV+rZr273nIrE/+k3p8UVHRwyPv7tyx/YH77jnhpRdLnmj632VLLz73Dwfu\nu+d+rZoe8uufXz/kb/n5eSWfQvGa//3U47/du23nju3vuf2WwsLC1E+/WrTw3FNP/FW7Vvvv2frC\nM09NPa/Sz6jMFZZ5d0t5KjEQAADwE7ANF8LGO+7YslXr1I1Wu7UJIWQ3bNRm9z2yapdx2PPPZ5/x\n5cIFIYTBF5zz7OgnN23a9LtuBzdr3nLaW1P+dPrJhUWFbXbfI/XIXVvvVrfudmWOeOmfzq5X/2fb\n16s359OZl/7p7KKiohDCs6OfvO36oXl5eS1btbr8L+cXP7ioqOhPp5/8ygvPN23W7JDf9wohPPnI\nQ5ddcG6Za75xyN9at9k9Lzf33tuH/fOh+0MI+fl5pw446s3XJnbq3HX/3x4wafxLp/Q/smSfLH5G\n5bSlPFU+EAAAsK3YhgvhGedecMmVQ0MIIx58bOgtt4cQrr/9znFTpjXdpVnpBx/Wq/ek9z4aeNIp\nDRs16tGr94NPjhk56ql//OuZEML/LF2yds2acVOmpR751Iuv7tVh3zJHvPMfo56bOOWpF8aHEBYt\nmL86Py+E8Mh9d4cQrrnljqdfnnTRZVcUP3j16vzZMz8JIVx9823DH3h49LhX+w44bseddipzzVdc\nf8vTL0+86qbbQghPPvqPEMK4Z8b8d9nS4089455HHr/vn6NPOO3MZUtynn96dOlnVM7p2lKeKh8I\nAADYVmTMylmedIZK6t/zkKU5ObmrVrZu0zY/L3fFt9+2bNV64EmDzjj3guLHrFyxonPH9iGED+Z9\nuX29eiGETZs2jnv2menT3l60YP7nc+esXbMmhDD1k7mNGjdOnVqZuh1CKL4bQkit5NPFX2dkZBQV\nFe27686pH9WtW/fX7XcLIUyfu6he/fr5ebmd9tkj9aOGjRod36fnzI9nhBAa77jjrw/43QFduvU+\nun8qxvfipX59dX7+b/dum7p73RWXjR3z/1tZSq++R//92pu+94xKKl5h6j9rybtFRUVl5rn28sGV\nGAgAAPgJ2IYvKrNowfx1a9eGEIpPaMz56ssV335b5oOLK83g8895ddwLv+50YK+j+rVrv9fpx/Yr\n/4gZGRnF/6akzhoNIdSqVSuEkJmZVfLB/3zuxbcnv/HWG69Nnfz6q+NeeHXcCy89/+yjY57f+vpT\nNm3cEELYsUmTBiWualOv/s9KP6MyFRUVpbrrD+Zpsv
POP2YgAABg27UNF8Lpcxf9br/2XQ859KY7\n7z2ia6fWbXe/99EnfvC3Xhv/SgjhutvvbNmq9Yz33yv9gJIl6gfVq19/56a7fP3f/5k0/qXeR/d/\n+YXnin+0fv36m6++omWrVkNuuKWoqOj9d6aefmy/j6a/X+Z6Jrz04lEDjxv/4tgQQqvd2tSrX3+3\n3fcIIQw4cdD5F18WQpg7e9b8z+cVf9BxSzIz//cc4K8WLWzdpu30aW//YJ6zLrioEgMBAAA/Adtw\nIfzvsqW5q1bus1/HNatXL1ow//C+R5fnt5q3bPnVooVnHNevdZvdP/5wemphqgRut91269evv3rw\nXy4ZMnTX1ruVZ20ZGRnHnnzqXbfedPmfLxj14P1zZ88q/lHdunU/mv7+048/9tr4V5o2a/bZrE9D\nCAd27Vbmeq69/NInH/3HZ7NmhhBOPfvcEEK/Y0947IGRD9x5x7w5s7OysiZPfDWE8PjzL289zw4N\nslu2ap3z1ZfH9urRfNdd58+b94N5KjcQAADwE7ANX1QmVWn23ne//73RYb/y/NaNw+/Zc6+9l+bk\nfLP86+H3P9wgu2EI4c3XJoYQzv3LJfXq13/ztUmr8/PLH+MPf7rw1LPP3b5eva8WLrjyxmHFyzMy\nMu5+5PHDjujz5cIFE18el7tq1ZHHDLxheNlf3vC3odcvWvDFDg2yLxx8eeryLc1atHzk6ef273Tg\n25Nff/O1ift3OvCxZ17o0PHnWw+TkZEx7J7799x7nzVrVq9ZvTp1rZ2t56ncQAAAwE/ANnxRmY0b\nN25Yv65e/Z8VFm5ev25dvXr1M7PSfcBzw4YN99x2Swjh972P7NDx5x9/MP3Eo3plZmZ+8PlXderU\n+cFf/941YAAAANJpGz5ltE6dOqnSlZmZWbv2D7evasrw7tQ3Z33y8aMP3NuseYulOYtDCIcd0ac8\nbRAAACBZ2/ApozVBRkbGPY8+3qvv0fXq1V+as7jJzk0HnXn2NcOGJ50LAADgh23Dp4wCAADwYzhC\nCAAAECmFEAAAIFIKIQAAQKQUQgAAgEgphAAAAJFSCAEAACKlEAIAAERKIQQAAIiUQggAABAphRAA\nACBSCiEAAECkFEIAAIBIKYQAAACRUggBAAAipRACAABESiEEAACIlEIIAAAQKYUQAAAgUllJB6hK\ng/r2TGroUWPHJzU0AABA5ThCCAAAECmFEAAAIFIKIQAAQKQUQgAAgEgphAAAAJFSCAEAACKlEAIA\nAERKIQQAAIiUQggAABAphRAAACBSCiEAAECkFEIAAIBIKYQAAACRUggBAAAipRACAABESiEEAACI\nlEIIAAAQKYUQAAAgUgohAABApBRCAACASCmEAAAAkVIIAQAAIqUQAgAAREohBAAAiJRCCAAAECmF\nEAAAIFIKIQAAQKQUQgAAgEgphAAAAJFSCAEAACKlEAIAAERKIQQAAIiUQggAABAphRAAACBSCiEA\nAECkFEIAAIBIKYQAAACRUggBAAAipRACAABESiEEAACIlEIIAAAQKYUQAAAgUgohAABApBRCAACA\nSCmEAAAAkVIIAQAAIqUQAgAAREohBAAAiJRCCAAAECmFEAAAIFIKIQAAQKQUQgAAgEgphAAAAJFS\nCAEAACKlEAIAAERKIQQAAIiUQggAABAphRAAACBSCiEAAECkFEIAAIBIKYQAAACRUggBAAAipRAC\nAABESiEEAACIlEIIAAAQKYUQAAAgUgohAABApBRCAACASCmEAAAAkVIIAQAAIqUQAgAAREohBAAA\niJRCCAAAECmFEAAAIFIKIQAAQKQUQgAAgEgphAAAAJFSCAEAACKlEAIAAERKIQQAAIiUQggAABAp\nhRAAACBSCiEAAECkFEIAAIBIVVkhXLVyxZnH9z+u9++P6t71nSlvhBCW5iw+uV/vAYd3P7lf72VL\nciq9BAAAgOpQZY
Vw1IP3dTm4++hxr9444p6///n8EMKwa6/q02/AmFdeO/KYgTcPHVLpJQAAAFSH\njFk5y6tkRe+8OXmP9nvt3HSX/1m65JjfH/L2zLkHdGj38lvvNWrceOWKFUd07fTOp/Mqt6TM4Wpl\nZGTXq/u9hX0OPahKnkslvDhpclJDAwAAbMmqteuLirb406yqGubArgeFED6a/v51Vwy+bOh1IYT8\nvNzshg1DCA2ys/NyV1V6SZkya9Vq1vBnVRX+x6tRYQAAAFJy120o2nIjrLJCWFCw6brLL1v57bfD\n7r6/bbs9Qwg7NMjOz8vNbtgo9W+ll5Rpc1Hh13lrqir8j1ejwgAAAKRs5fBgqMJC+NgD9+3SvMXV\nN99WvKRT5y6TJ03o2//YKZMmdOrcpdJLylRYWPRN/rqqCv/j1agwAAAA5VFlnyE8dcBRi+Z/UbtO\nndTdie/OWLYk528XnhcyMkIIN424Z5fmLSq3pPwZBvXtWSXPpRJGjR2f1NAAAACVU2WFsCZQCAEA\nAMrPF9MDAABESiEEAACIlEIIAAAQKYUQAAAgUgohAABApBRCAACASCmEAAAAkVIIAQAAIqUQAgAA\nREohBAAAiJRCCAAAECmFEAAAIFIKIQAAQKQUQgAAgEgphAAAAJFSCAEAACKlEAIAAERKIQQAAIiU\nQggAABAphRAAACBSCiEAAECkFEIAAIBIKYQAAACRUggBAAAipRACAABESiEEAACIlEIIAAAQKYUQ\nAAAgUgohAABApBRCAACASCmEAAAAkVIIAQAAIqUQAgAAREohBAAAiJRCCAAAECmFEAAAIFIKIQAA\nQKQUQgAAgEgphAAAAJFSCAEAACKlEAIAAERKIQQAAIhUVtIBfoIG9e2ZyLijxo5PZFwAAGAb5Qgh\nAABApBRCAACASCmEAAAAkVIIAQAAIqUQAgAAREohBAAAiJRCCAAAECmFEAAAIFIKIQAAQKQUQgAA\ngEgphAAAAJFSCAEAACKlEAIAAERKIQQAAIiUQggAABAphRAAACBSCiEAAECkFEIAAIBIKYQAAACR\nUggBAAAipRACAABESiEEAACIlEIIAAAQKYUQAAAgUgohAABApCpcCO+9fdiSxYurIwoAAADplFXR\nX2jStOkl5521fb16/Y47oXvPI7bbbrvqiAUAAEB1q/ARwgEnDnryhVcuGTL0obtHHPSLDkMvu/jL\nhQuqIxkAAADVqsJHCD98791xz/37PzM+6HJIj159j/7PjA9O7d/3jQ9nVkc4AAAAqk+FC+FzTz/Z\np9+AITfcUqtWrRBCu732Xr9uXTUEAwAAoHpV+JTRS4YM3bBhQ61atV58dkx+Xm5mZuYpZ51THckA\nAACoVhUvhOedvfLbb0IIa1av/us5f6iGSAAAAKRDhQvhmtWr+w44LoRw3KDTVufnV0MkAAAA0qHC\nhbCwsPD9d6YWFGz64L1pBQUF1ZEJAACANKjwRWWuvHHYkIsvnDdndpvd2113+4jqyAQAAEAaVLgQ\n7r3vfv8e/3p1RAEAACCdKlwIrx781zFPjCq+OytneZXmAQAAIE0qXAj/8+H06XMX1atfvzrSAAAA\nkDYVvqhM54O75yz+qqioqDrSAAAAkDYVPkL48Mi7Hx55d/Fdp4wCAABsoypcCDVAAACAn4YKnzIK\nAADAT0OFC+GcT2cO7NWjY+tdnnnqiWlvTamOTAAAAKRBhQvhdVcMvm3kg5s3b+7a/dARN99QHZkA\nAABIg8qcMrpr691CCE12bpqZlVnFcQAAAEiXChfC2rXrfDHvsxDCwvlfbC7YXA2RAAAASIcKX2X0\nqhuHXXHxhZlZWeeecsKNw++pjkwAAACkQYULYZs92j3x/MvVEQUAAIB0qnAh7NCyScm7vpYQAABg\nG1XJL6YvKir6/LM5z/3ryWqIBAAAQDpU8ovpMzIyWrZq/e7UN6s2DQAAAGnzo04Z
HXDioCoNAwAA\nQPpU8pRRAAAAtnWVPGUUAACAbd2PvcpoisOGAAAA25wKHyEceNIp9/1z9EcLltz/+L/6HXfCrJzl\n2iAAAMC2qMKFcM6nM7sc3L1OnTqdDzpk7uzZ1ZEJAACANKhwISwo2DTltYmpfzdu2FAdmQAAAEiD\nCn+G8Jphw4dcfOH5p5/cuu3u1w4bXh2ZAAAASIMKF8J99uv4zKtvVEcUAAAA0qkynyEc2KtHx9a7\nPPPUE9PemlIdmQAAAEiDChfC664YfNvIBzdv3ty1+6Ejbr6hOjIBAACQBpX5YvpdW+8WQmiyc9PM\nrMwqjgMAAEC6VLgQ1q5d54t5n4UQFs7/YnPB5mqIBAAAQDpU+KIytevUuerSv2RmZZ17ygk3Dr+n\nOjIBAACQBhUuhO3a79W528GdunSrVasyp5sCAABQQ1S4ED72wMjHHhhZfHdWzvIqzQMAAECaVLgQ\naoAAAAA/DRU47bPfYQelbtx5i2+bAAAA2OZV5nOA9995R5XnAAAAIM1cGAYAACBSCiEAAECkKnBR\nmbmzZ3Vo2SR1u/iGa8wAAABsoypQCHU/AACAnxKnjAIAAERKIQQAAIiUQggAABAphRAAACBSCiEA\nAECkFEIAAIBIKYQAAACRUggBAAAipRACAABESiEEAACIlEIIAAAQKYUQAAAgUgohAABApBRCAACA\nSCmEAAAAkVIIAQAAIqUQAgAARKoqC+HmgoLe3Q4ovrs0Z/HJ/XoPOLz7yf16L1uSU+klAAAAVIcq\nK4Qvj33uqB7dFs7/onjJsGuv6tNvwJhXXjvymIE3Dx1S6SUAAABUh4xZOcurZEWbN28ORUUdd2tW\nvMIDOrR7+a33GjVuvHLFiiO6dnrn03mVW1LmcFmZtVrt2OB7Czsf0KlKnkslTJ32buIxSmYAAAAI\nISxanltYVLSln2ZV1TCZmZnfW5Kfl5vdsGEIoUF2dl7uqkovKVNGyNiudpWF//FqQpiakAEAAKhZ\nMkLYYh+sukJY2g4NsvPzcrMbNkr9W+klZdq0efPcZd9WX/iKqglhakIGAACgRiks3HIdrNZC2Klz\nl8mTJvTtf+yUSRM6de5S6SVbsnmrTyzNakKYmpABAADYhlTZZwhTOrRsUrzCZUty/nbheSEjI4Rw\n04h7dmneonJLyj/6oL49q/C5VMioseMTj1EyAwAAwA+q4kKYLIUwkXEBAIBtlC+mBwAAiJRCCAAA\nECmFEAAAIFIKIQAAQKQUQgAAgEgphAAAAJFSCAEAACKlEAIAAERKIQQAAIiUQggAABAphRAAACBS\nCiEAAECkFEIAAIBIKYQAAACRUggBAAAilZV0AKrLoL49Exl31NjxiYwLAABUlCOEAAAAkVIIAQAA\nIqUQAgAAREohBAAAiJRCCAAAECmFEAAAIFIKIQAAQKQUQgAAgEgphAAAAJFSCAEAACKlEAIAAERK\nIQQAAIiUQggAABAphRAAACBSCiEAAECkFEIAAIBIKYQAAACRUggBAAAipRACAABESiEEAACIlEII\nAAAQKYUQAAAgUgohAABApBRCAACASCmEAAAAkVIIAQAAIqUQAgAAREohBAAAiJRCCAAAECmFEAAA\nIFIKIQAAQKQUQgAAgEgphAAAAJFSCAEAACKlEAIAAERKIQQAAIiUQggAABAphRAAACBSCiEAAECk\nFEIAAIBIKYQAAACRUggBAAAipRACAABESiEEAACIlEIIAAAQKYUQAAAgUgohAABApBRCAACASCmE\nAAAAkVIIAQAAIqUQAgAAREohBAAAiJRCCAAAECmFEAAAIFIKIQAAQKQUQgAAgEhlJR2An7hBfXsm\nMu6oseMTGRcAALYhjhACAABESiEEAACIlEIIAAAQKYUQAAAgUgohAABApBRCAACASCmEAAAAkVII\nAQAAIqUQAgAAREohBAAAiJRCCAAAECmFEAAA
IFIKIQAAQKQUQgAAgEgphAAAAJFSCAEAACKlEAIA\nAEQqK+kAUO0G9e2Z1NCjxo5PamgAAPhBjhACAABESiEEAACIlEIIAAAQKYUQAAAgUgohAABApBRC\nAACASCmEAAAAkVIIAQAAIqUQAgAAREohBAAAiJRCCAAAECmFEAAAIFIKIQAAQKQUQgAAgEgphAAA\nAJHKSjoAxGJQ355JDT1q7PikhgYAoCZzhBAAACBSCiEAAECkFEIAAIBIKYQAAACRUggBAAAipRAC\nAABEytdOQFyS+vYLX30BAFADOUIIAAAQKYUQAAAgUgohAABApBRCAACASCmEAAAAkVIIAQAAIuVr\nJ4AE+PYLAICawBFCAACASCmEAAAAkXLKKBAvZ64CAJFzhBAAACBSCiEAAECkFEIAAIBI+QwhQJKS\n+hxj8FFGAEAhBCC4vg4AxMopowAAAJFyhBCAmsKBSgBIM0cIAQAAIuUIIQB8R004UOlqQwCkR5UV\nwqU5iwdfcM76deu22377W+66r1mLllW1ZgAgEXopwE9elRXCYdde1affgIEnnTLmiVE3Dx0y/IFH\nqmrNAEDMasIx25oTA6BqVVkhfHfqW1feeGsI4dDDe99x43VVtdotyQihdlZmdY9SfnVqQJiakCGI\n8V1i1KgMQYzvqgkxakKGIMZ3iVGjMoRSMY47okciMUa/NDHxDDUkRskMNScGbMXGgs1b+WmVFcL8\nvNzshg1DCA2ys/NyV1XVarckKzNzj6aNqnuU8qsJYWpChiDGd4lRozIEMb6rJsSoCRmCGN8lRo3K\nEMT4rpoQoyZkCKVi7L///onE+OCDDxLPIMaWMhT7bNm3hYVFW/qVjFk5y6tk7AM6tBv/9vvZDRut\nWrniiK4HvD1zbpWsdkuyatVq0XiHah0CAABgW7f427zCoi0Wwio7Qtipc5fJkyb07X/slEkTOnXu\nUlWr3ZKCwsIvv8mt7lEAAAB+wqrsCOGyJTl/u/C8kJERQrhpxD27NG9RJasFAACgmlRZIQQAAGDb\nUivpAAAAACRDIQQAAIiUQggAABAphRAAACBSCiEAAECkFEIAAIBIKYQAAACRUggBAAAipRACAABE\nSiEMIYT169dfPfiv69evTzbGV4sWjrzj1mQzhBCmTn79peefSTpFePKRh2Z+PCPpFOGmqy7PXbUy\n2Qy5q1bedNXlyWYIIcz8eMaTjzyUdIrw0vPPTJ38etIpwsg7bv1q0cKkUwQbrmI2XCXZcBWz4SrJ\nhquYDVdJNlzFot1wKYQhhLBp08YxT4zatGljsjG++frrCS+9kGyGEMJns2bOeP+9pFOEqZNfrwnv\nW88/PXrtmjXJZli7Zs3zT49ONkMI4atFC2vCDs2M99/7bNbMpFOECS+98M3XXyedIthwFbPhKsmG\nq5gNV0k2XMVsuEqy4SoW7YZLIQQAAIiUQggAABCprKQDVNKmTRvXrV1bVWtbnZ8fQsjPzS0qLKyq\ndVbCmjX5mwsL83JXJZghhLB+/fqNGzckHqOgYNO6tWsTj1EUilbn5yUbY3V+XlEoSnwq1q1dW1Cw\nKfEYGzduWL9+feIxNhcWrlmTn3iMYMP1/9hwlWTDVcyGqyQbrmI2XCXZcBX7CW+4dmiQnZGRsaWf\nZszKWV6Fg6XN1Mmv33nLDVW1tqKiolUrVzRs1HgrM5UGBQUFa9esaZCdnWCGEML6desKCwvr1a+f\nbIzV+fl16tapU6dusjFWrVzRILthrVpJHksvLCzMy13VsFHjBDOEEDZu3LBxw8af7bBDsjHWrllT\nq1at7bbfPtkYebm59erXz8pK+P+prVzxrQ1Xig1XSTZcxWy4SrLhKmbDVZINV7Gf8Ibr0TFjt/JK\n21YLIQAAAD+SzxACAABESiEEAACIlEIIAAAQKYUQAAAgUgohAABApBRCAACASCmEYWnO4pP79R5w\nePeT+/Ve
tiQnnUMv/nLR/nu27tHplz06/XLEzdenP8zmgoLe3Q4ovlt69PTk+V6MRKZl1coVZx7f\n/7jevz+qe9d3prxR5qDVHaN0hkSmoqBg0yXnnXXsEYcdfWjXcc/9u8xBE4mR4N/LnE9n/qpdq9Tt\npP5Mvhcj/bNRnhHTMBWlYyTywlj+9X/PPunYgb0OPeHIw5fmLC5z0ERipH82UmP16PTLQ3/7i5/v\n1rzMEdMwFaVjJLXFGDtmdN9Duhx5SOex//5XmYMmEiOR2Zg/b+7J/Xof36fnKcccmdSfSekMaZ6K\nSuxoVUeeH4yRnmn5XozyBKvyGD+YIZGpSHwv1PcQhj+fffoBXboNPOmUMU+MenvKG8MfeCRtQ7/1\nxmsfvvfuRZddnkiYl8c+N3L4rQs+n1f8Gig9ehrylI6RyLTcecsN2Q0bnXLWOXM+nXnOoOMnz/g0\n/bNROkMiUzFl0oTXX31l6LA7luYs7nfYwe/O/iKRF0bpGEn9vXyz/Ouhl138+quvpF6iicxG6Rjp\nn43yjJiGqSgdI5EXxp/PPr3rIT2OPvb4fz/1+PR33r75rpGJzEbpGAm+rYwe9cimTZtOPuOspP5G\nvhcjqanotPfu496cVlhYeOTBnZPafpaOkchsDOx16EWDLz+w28FTJk14fszoO+5/OP2zUTpDOqei\ncjtaVZ6nPDHSMC2lY6R/R7Q8GRKZisT3Qh0hDO9OfatHrz4hhEMP7/3+O2+nc+gvFy54Y8L4X7Rt\ncfShXed8OjPNYX7f+8jnJ04puaT06GnIUzpGItOyf6cDD+97dAihUePGmzZtKnPQ6o5ROkMiU9Hh\n5//n3L9cUlhYuGzJkgbZDcocNJEYiczGhg0brh78179fc0PxkkRmo3SM9M9GeUZMw1SUjpHIC+Od\nNyfP+uTjX7Rt8fC9d3Xr3qPMQROJkdTbyuIvF02ZNOHE084sc8S0vbWVjJHUVGQ3arg0Z/GyJTnZ\nDRuWOWgiMRKZjc9mz9r/gN+FEDp16fbe21PLHLS6Y5TOkM6pqNyOVpXnKU+MNExL6Rjp3xEtT4ZE\npiLxvVCFMOTn5aY2lw2ys/NyV6Vz6IaNGp1x3vnvfbbwyP7HXnnJRWkOk5mZmZmVVXJJ6dHTkKd0\njESm5cCuB+3cdJePpr9/3mknXTb0ujIHre4YpTMkMhU7Ndm5abPmfzrtpEHH9PnTxZeVOWgiMRKZ\njRuuuOyUP/yxWYuWxUsSmY3SMdI/G+UZMVeJTHEAAAcbSURBVA1TUTpGIi+MNatXt9i11btzFpxw\n2pmPPjCyzEETiZHU28rNQ4dcdNkVtWrVKnPEtL21lYyR1FRceuW1x/fpecKRh1965TVlDppIjERm\no/3e+zzz1OMbN258/B/3565aWeag1R2jdIZ0TkXldrSqPE95YqRhWkrHSP+OaHkyJDIVie+FKoRh\nhwbZ+Xm54X8nulE6h+59dP8jjxlYp06dE047c96cOcmGKXP0RPIkMi0FBZuuHvzXR++/d9jd9/fp\nN6DMQas7RukMiUzF5s2bCwsL73n0iX8+O27YNVeWOWgiMRKZjVfHvXDqgKM6tGwSQujQssmGDRsS\nmY3SMdI/G+UZMQ1TUTpGIi+MBtkNTzz9D3Xr1h1w4qC5s2eVOWgiMRKZjdkzPynYtKn9Ph1Sd5N6\nK/lejKTeYW+5ZsgTz7/8+HMv3XT1FWUOmkiMRGbj2ltHjH7skd+0b/M/y5bt0CC7zEGrO0bpDMnu\nepVnBtKQp/QQNWSPtCbsiMa5F6oQhk6du0yeNCGEMGXShE6du6Rz6Mv/cv57b78VQvjkow/b77NP\nsmHKHD2RPIlMy2MP3LdL8xYjHnq0bbs9U0vSPxulMyQyFQ+PvOvhkXdlZGQ0bda8sLCozEETiZHI\nbLw7Z/6snOWps/xn5SyvW7duIrNROkb6Z6M8I6ZhKkrHSOSF8dvfdZ725u
QQwvRpb6caSCKzUTpG\nIrMx5olRRx4zsPhuUm8l34uR1Dts7spVLVq1atGqVX5eXpmDJhIjkdn4Yt5n9zz6+McLl+yzb8fO\nBx1c5qDVHaN0hmR3vcozA2nIU3qIGrJHWhN2ROPcC3VRmbBsSc7fLjwvZGSEEG4acc8uzVukbeiF\n87+44q8XFGwqyKqddfXNt7Vrv3f6w3Ro2aT4NVB69LTlKRkjkWk5dcBRi+Z/UbtOndTdie/OSP9s\nlM6QyFSsXLHikvPOystdtWnjxnP+fMlhR/RJ5IVROkayfy/FL9EE/0xKxkj/bJRnxDRMRekYibww\nluYsvvT8P27csKF2nTrX3HLH7nu2T2Q2SsdI/2wUFRUdsn/HZyZMbrzjjqkliUxF6RhJbTHGjhn9\nj5F3FxUVnXneBX37H5vUFuN7MRKZjXfenDzs2qtq167TeMcdr7v9zp2a7Jz+2SidIf1TUdEdrWrK\ns/UYaZuWkjHKE6w6Ymw9QyJTkfheqEIIAAAQKaeMAgAAREohBAAAiJRCCADhvhG3T538+qP33/vq\nuBe28rDzzxh0fJ+e/122tJyrHfXQ/akbqYvEAkBNoxACQPhs1sy9Ouw7b87sPffeZysPe/3VV54Y\n+3LTZs3Ludqbr74idWPG/JwfGxEAqoFCCEDsBp9/zuuvvtLtFx1efemF+4bfVrz80/98NLBXjxOO\nPPzYIw6b9cnHqYUX/uHU7x0h7NCyyS3XXNmhZZO+h3Q5sW+vvod0uXnokBDC+WcMCiGkHvzL3Vum\nHnnb9UOPPrTr0MsuDiHM+2zO8X16nnhUrxuv/HvqEOLoUY8M7NXjpKOP6HfYQf+49640PX8AIuYq\nowDE7r/Llg65+KI77n948AXn3P3wP4uXH9W96+Crrz2gS7d3prwx7Lqrn5s4pfT1ykMIHVo2efCJ\np6dOeeOIo47p0PHnuatWHrjvnqmHFT8+daNDyybPTZzSYtdWv9mrzayc5QN7HXrRZVcc2PWgaW9N\nOfP4/rNylv+qXat/vTRhjz33Wrd27ROPPHjmeRemcx4AiFBW0gEAIEkvPjvmsgvODSH8Zq82IYSL\nzjpt+AOPpH604It5vz7gwBDCrw/83fzP521lJZ26dFu3bt1jD4xs0LBh7aytvbe222vvjIyM1O3P\nZs/6zYG/CyHs3+mA1JI7H3rswbtGrF6dX7t27ZJfsw4A1UQhBCBqffoNWPHNN02bNc/Py83MzOp3\n3AnFP2qze7sP3p3WqXPX6dPe2X2PdltZSa1atS49/48Tp81ovNNOi79cVHwtmdKK22AIoe3u7aZP\ne+eALt1mvP9easkbE8Zfc+vwunXr5nz1Zd/uXT78/Ksf/fwAYGsUQgBiN3f2rO49j3jkvruPHXRa\nyeXX3T7i2r9fmpVVe/PmzdfdfufWV3LKH/545gn9d266S9t2e7Zrv/ej99976tnntm7TdtmSnGYt\nWpb5K9ffcefVg/9ap27djv/nl5lZWSGEVm3aHndEj/o77LBxw4aLr7i6ip4fAGyRzxACQDLuuvWm\njr/4VeduB7/y4vMP3DX8hdenJp0IgOg4QggAyehyUPfL/3L+eQsX7NZ296tuvu2HfwEAqpojhAAA\nAJHyPYQAAACRUggBAAAipRACAABESiEEAACIlEIIAAAQKYUQAAAgUgohAABApBRCAACASCmEAAAA\nkVIIAQAAIqUQAgAAREohBAAAiJRCCAAAEKn/C1QkLhHPZp39AAAAAElFTkSuQmCC\n"
}
],
"prompt_number": 4
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# Histogram # ratings per movie\n",
"counts_per_movie = ratings\\\n",
" .keyBy(lambda (user, movie, rating, ts): movie)\\\n",
" .groupByKey()\\\n",
" .map(lambda (movie, ratings): len(ratings))\n",
"\n",
"histogram_per_movie = counts_per_movie\\\n",
" .map(lambda c: (c - c % 400, 1))\\\n",
" .reduceByKey(lambda x,y: x+y)\\\n",
" .collect()\n",
"\n",
"histogram_per_movie_df = pd.DataFrame(histogram_per_movie, columns = ['ratings', 'frequency'])\n",
"histogram_per_movie_df.sort(inplace=True, columns = 'ratings')\n",
"histogram_per_movie_df.describe()"
],
"language": "python",
"metadata": {},
"outputs": [
{
"html": [
"<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>ratings</th>\n",
" <th>frequency</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>count</th>\n",
" <td> 72.000000</td>\n",
" <td> 72.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>mean</th>\n",
" <td> 14855.555556</td>\n",
" <td> 148.291667</td>\n",
" </tr>\n",
" <tr>\n",
" <th>std</th>\n",
" <td> 9388.588952</td>\n",
" <td> 868.174147</td>\n",
" </tr>\n",
" <tr>\n",
" <th>min</th>\n",
" <td> 0.000000</td>\n",
" <td> 1.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>25%</th>\n",
" <td> 7100.000000</td>\n",
" <td> 1.750000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>50%</th>\n",
" <td> 14200.000000</td>\n",
" <td> 6.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>75%</th>\n",
" <td> 22100.000000</td>\n",
" <td> 24.250000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>max</th>\n",
" <td> 34800.000000</td>\n",
" <td> 7315.000000</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>8 rows \u00d7 2 columns</p>\n",
"</div>"
],
"metadata": {},
"output_type": "pyout",
"prompt_number": 5,
"text": [
" ratings frequency\n",
"count 72.000000 72.000000\n",
"mean 14855.555556 148.291667\n",
"std 9388.588952 868.174147\n",
"min 0.000000 1.000000\n",
"25% 7100.000000 1.750000\n",
"50% 14200.000000 6.000000\n",
"75% 22100.000000 24.250000\n",
"max 34800.000000 7315.000000\n",
"\n",
"[8 rows x 2 columns]"
]
}
],
"prompt_number": 5
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"%%R -w 1200 -i histogram_per_movie_df\n",
"# Bar chart of the first 25 rows of the histogram frame passed in from Python\n",
"p <- ggplot(\n",
"    data = head(histogram_per_movie_df, 25),\n",
"    mapping = aes(x = factor(ratings), y = frequency)\n",
"  ) +\n",
"  theme_economist() +\n",
"  geom_bar(stat = 'identity') +\n",
"  labs(title = '# ratings per movie', x = '# of ratings', y = 'Frequency')\n",
"print(p)"
],
"language": "python",
"metadata": {},
"outputs": [
{
"metadata": {},
"output_type": "display_data",
"png": "iVBORw0KGgoAAAANSUhEUgAABLAAAAHgCAIAAAA69QPIAAAgAElEQVR4nO3dZ3iUZd434CskhCYE\nVKRFEBRRUfbZXXdFBCxYkC4KthXr6ioqrg1dwYqVtaAiig1ZFddeUJGiYK9YEFAURAngiosQQHry\nfpjnycFLgiZhMpN4necHjpl7Zu7rN5P/kfjzvmcmY2bekgAAAEB8qqU7AAAAAOmhEAIAAERKIQQA\nAIiUQggAABAphRAAACBSCiEAAECkFEIAAIBIKYQAAACRUggBAAAipRACAABESiEEAACIlEIIAAAQ\nKYUQAAAgUgohAABApBRCAACASFWxQvjp9A8vv+i8xQvz5syedflF533z9Vfl3tVXX87+6svZGzds\nSFxtm9uwbW7Dn5YuTVLSKs8LAgAAv3lVrBC+PmXS0489mlO/wduvT31q3CN16tYt9676dOncp0vn\n/Pz8xNWmuTs2zd2xWrWMJCWt8rwgAADwm5eV7gBlM/vzGS1atqpdp87sz2dsu/32DXdolKw9P/7S\n5BBC3Xo5ydphVecFAQCA37wqc4Rw+DVXtM1tOG3KpPnz5rbNbTj+mSeX/vjjnjvusHDBgqL7/LR0\naeJEx88+/qhbp30evHvk0h9/HHrhoAP+sGe7Fo3/vFvL0449avbnMzZu2NA2t2HiIR3btfl0+oeJ\nCx3btVm+bFnRTqZMeOmwDn/cq3mjo7oeNPOzTxP3LywsfGDUnR3btemw564TX3xh0/Mq/7N40YVn\n/bXDnrvu1bzRQX/63bVDL12xIn/Tp1C05yfHPbzP7q06tmsz8pabCgoKErd+N/+bs046/o+tm++9\na4tBp52UeF7Fn1GJO5w84cVunfbZq3mjvoce8On0Dy8dNLDdTk0O67D3W9NeK4r9r/tHJ57OYR3+\n+K/7RxcWFoYQ/vH3c9rmNrzthmsTdxt73z1tcxuedeJxm74gW8oGAABUdVWmEG673Xa5zVskLjTf\nqWUIIad+g5Y775JVvYSDnH8/49Rvv5kXQhh87plPP/bo+vXr99v/wCZNc995Y9rZp5xQUFjQcudd\nEvfcscVONWrULHHFi88+o3adbWrVrj378xkXn31GokE9/dijN197VX5+fm7z5pedf07RnQsLC88+\n5YSXn3+2UZMmBx3WLYTw6IP3XXLuWSXu+fqhl7ZouXP+8uV33TL8X/fdE0JYsSL/pH59Xp8yqX3H\nznvvs+/kCS+eeFSvTftk0TMq0YVnnl6zVq2atWp9OWvmgL69Xpv0yvbbN8z77ttLBp2VKJyPjrn/\nhisu++H77/fZr9MP339/wxWXjRtzfwihe5++IYRpUyYm9vP2tNdCCF179dl057+aDQAAqKKqTCE8\n9axzL7r8qhDCiHsfuuqmW0II195y+/hp7zRq3KT4nQ/t1mPyex/3/8uJ9Rs0OKRbj3sffWLU2HH3\n//upEML3ixb+vGrV+GnvJO457oVXdmu7Z4kr3n7/2GcmTRv3/IQQwvx5c1euyA8hPHj3nSGEq2+6\n9fGXJp93yZCiO69cuWLWjM9CCFfeePNtox94bPwrvfsds93225e45yHX3vT4S5OuuOHmEMKjY+4P\nIYx/6on/LF507Emnjnzw4bv/9dhxJ5+2eGHes48/VvwZlbjDex99/OmJUx9+5sUQwoYN6x9+dvzT\nk6aGEJb++GP+8mUhhDH33BVCGHbL7feNe/Kam0eEEB669+4QQvv9Om273XZzZs/6ftHCdevWffDO\nW9nZ2QcdevimO//VbAAAQBVVZd5DeFTXgxbl5YUQhlxw7or85SGEG64cMu/rOaeedW7xO5978T9q\n1a4dQrjutjvHP/3Uw/ePnj9v7ldfzk7cWlBQWJoVO3Q+IITQqvWuiasbNmz8edWqb+Z+HUI4tHvP\nEEKvI/tdN/TSxK3bbFN3r//5w4xPph/X6/Btt9vuT/
vut2+n/XsccVSJez6sR6/Ev5dfdF7ed9/+\nvGrVjE8/CSE8+uB9jz54X9HdPpv+YdEeip5RifZu3yGEsMP/deOdW7fJyPjfD4MpLAwrV6xYlLcg\nhJBoegce0jWEkFi3dp06h/XoPe6hB15/dXLznVquWbPm4K7dt/n/P6pnS9nCqaf/+osIAABUYlWm\nEM6fN3f1zz+HEIrOnMz77tul//1viXcu6k6DzznzlfHP/6l9h259+rZus9spR/ct/YqJTlXUrEII\nibNGQwjVqlULIWRmZm16538988JbU19747Upb0599ZXxz78y/vkXn316zBPP/vL+E9avWxtC2K5h\nw3qbfIhL7TrbFH9Gv7qr4lcTaYvfmrjQrXffcQ89MG3KpJ1b7xpCOLz3EZvt/FezAQAAVVSVKYQf\nfDl/v73adD7o4Btuv6t75/YtWu1815hHfvVRUya8HEIYdsvtuc1bTH//veJ3KOp4pVG7Tp0dGjX+\n4T/fT57wYo8jjnrp+WeKblqzZs2NVw7Jbd586HU3FRYWvv/2m6cc3ffjD94vcT8TX3yhT/9jJrzw\nXAih+U4ta9eps9POu4QQ+h0/4JwLLwkhfDlr5tyv5hS90XEr1a5Tp9mOOy5csODViS9373PkqxNf\nDiHkNm+RKJn/s/efGjdt9u6br383/5uatWrt3+WQzR5eodkAAIA0qjKF8D+LFy1f9tMee7VbtXLl\n/Hlzix/IKlHT3Nzv5n9z6jF9W7Tc+ZOPPkhsTJTAmjVrrlmz5srB51809KodW+xUmr1lZGQcfcJJ\nd/zzhsv+fu7Ye+/5ctbMoptq1Kjx8QfvP/7wQ1MmvNyoSZMvZn4eQujQef8S93PNZRc/Oub+L2bO\nCCGcdMZZIYS+Rx/30OhRo2+/dc7sWVlZWVMnvRJCePjZl0qTqjROPmPgsCGXDDn/3Gf+Pe6j994J\nIZxy5tmJm6pVq9at9xEPjLpz3ldzuvU+ovihyIrOBgAApEuV+VCZRMXafc+9/vdC271K86jrbxu5\n6267L8rL+3HJD7fd80C9nPohhNenTAohnHX+RbXr1Hl9yuSVK1aUPsZfzx500hln1apd+7tv5l1+\n/fCi7RkZGXc++PCh3Xt++828SS+NX75sWa8j+19328gSd3LpVdfOn/d13Xo5gwZflvicmCbNch98\n/Jm923d4a+qrr0+ZtHf7Dg899Xzbdr8rfbBfdsyJp1x69XU7NG78/jtv7dC4yZBhN2z6+TTdev/v\nmbRde/Yp/tiKzgYAAKRLxsy8JenOUCrr1q1bu2Z17TrbFBRsXLN6de3adTKzUn14c+3atSNvvimE\ncFiPXm3b/e6TDz84vk+3zMzMD7/6Ljs7+1cf/tPSpR3btQkhVJXXHAAA+G2rMqeMZmdnJ0pXZmZm\n9eq/3r4qKMO7b74+87NPxoy+q0nTZomP7jy0e8/StEEAAIDKpsqcMloZZGRkjBzzcLfeR9SuXWdR\n3oKGOzQacNoZVw+/Ld25AAAAyqPKnDIKAABAcjlCCAAAECmFEAAAIFIKIQAAQKQUQgAAgEgphAAA\nAJFSCAEAACKlEAIAAERKIQQAAIiUQggAABAphRAAACBSCiEAAECkFEIAAIBIKYQAAACRUggBAAAi\npRACAABESiEEAACIlEIIAAAQKYUQAAAgUlnpDlB+A3p3TdfSY5+bkK6lAQAAksURQgAAgEgphAAA\nAJFSCAEAACKlEAIAAERKIQQAAIiUQggAABAphRAAACBSCiEAAECkFEIAAIBIKYQAAACRUggBAAAi\npRACAABESiEEAACIlEIIAAAQKYUQAAAgUgohAABApBRCAACASCmEAAAAkVIIAQAAIqUQAgAAREoh\nBAAAiJRCCAAAECmFEAAAIFIKIQAAQKQUQgAAgEgphAAAAJFSCAEAACKlEAIAAERKIQQAAIiUQggA\nABAphRAAACBSCi
EAAECkFEIAAIBIKYQAAACRUggBAAAipRACAABESiEEAACIlEIIAAAQKYUQAAAg\nUgohAABApBRCAACASCmEAAAAkVIIAQAAIqUQAgAAREohBAAAiJRCCAAAECmFEAAAIFIKIQAAQKQU\nQgAAgEgphAAAAJFSCAEAACKlEAIAAERKIQQAAIiUQggAABAphRAAACBSCiEAAECkFEIAAIBIKYQA\nAACRUggBAAAipRACAABESiEEAACIlEIIAAAQqaxk7WjJD/8Zcv65Py39b1ZW9X/eNbpp7o6L8hYM\nPvfMNatX16xV66Y77m7SLLc0W5KVBwAAgF+WtCOE1w29tGvPPo+/NLnvscePuPG6EMLwa67o2bff\nEy9P6XVk/xuvGlrKLQAAAKRGxsy8JUnZ0T67t+rZt99T4x5u0iz37AsGd+vTd9+2rV96470G2277\n09Kl3Tu3f/vzOaXZUnLKjIza2ZsfzDyya5ekJC+HpyZMSdfSAAAApffz2vWFW741aaeMrlq5stmO\nzd+dPe+JR8aOGT2qW5++K/KX59SvH0Kol5OTv3xZCKE0W0pOWa1ai+1zkhV161WqMAAAAFvyxeL/\nFhZssRImrRDWy6l//Cl/zc7O7nf8gOFXXxFCqFsvZ0X+8pz6DRL/lnJLiQoKC35csTpZUbdepQoD\nAACwJYW/cHwwiYVwn/06vvP61P0PPvSDd95qs0fbEEL7jp2mTp7Y+6ijp02e2L5jp1JuKdHGgsIf\n8lclK+rWq1RhAAAAyidp7yFclLfg4nP+tm7t2urZ2VffdOvOu7ZZvDDv0kEDQ0ZGCOGGESMbN21W\nmi2lX3FA765JSV4OY5+bkK6lAQAAkiVphTD1FEIAAICt4YvpAQAAIqUQAgAAREohBAAAiJRCCAAA\nECmFEAAAIFIKIQAAQKQUQgAAgEgphAAAAJFSCAEAACKlEAIAAERKIQQAAIiUQggAABAphRAAACBS\nCiEAAECkFEIAAIBIKYQAAACRUggBAAAipRACAABESiEEAACIlEIIAAAQKYUQAAAgUgohAABApBRC\nAACASCmEAAAAkVIIAQAAIqUQAgAAREohBAAAiJRCCAAAECmFEAAAIFIKIQAAQKQUQgAAgEgphAAA\nAJFSCAEAACKlEAIAAERKIQQAAIiUQggAABAphRAAACBSCiEAAECkFEIAAIBIKYQAAACRUggBAAAi\npRACAABESiEEAACIlEIIAAAQKYUQAAAgUgohAABApBRCAACASCmEAAAAkVIIAQAAIqUQAgAAREoh\nBAAAiJRCCAAAECmFEAAAIFIKIQAAQKQUQgAAgEgphAAAAJFSCAEAACKlEAIAAERKIQQAAIiUQggA\nABAphRAAACBSCiEAAECkFEIAAIBIKYQAAACRUggBAAAipRACAABESiEEAACIlEIIAAAQKYUQAAAg\nUgohAABApBRCAACASCmEAAAAkVIIAQAAIqUQAgAAREohBAAAiJRCCAAAECmFEAAAIFIKIQAAQKQU\nQgAAgEgphAAAAJFSCAEAACKlEAIAAERKIQQAAIiUQggAABAphRAAACBSCiEAAECkFEIAAIBIKYQA\nAACRUggBAAAipRACAABEqsyF8K5bhi9csKAiogAAAJBKWWV9QMNGjS4aeHqt2rX7HnNcl67da9as\nWRGxAAAAqGgZM/OWlONhX8z8/NJBZy1euPDw3kecdMZZLVq2SnqyXzWgd9fUL5ow9rkJ6VoaAAAg\nWcp8hPCj994d/8yTn07/sNNBh3TrfcSn0z886ajer300oyLCAQAAUHHKXAifefzRnn37Db3upmrV\nqoUQWu+2+5rVq4tunf35jL8c0f2jr74LISzKWzD43DPXrF5ds1atm+64u0mz3NJsSeJzAwAA4BeU\n+UNlLhp61dq1a6tVq/bC00+syF+emZl54ulnJm76cckPd906vKgfDr/mip59+z3x
8pReR/a/8aqh\npdwCAABAapT5PYSnH9+/e5++vfsd89jYB1995eXRjzye2L527doLzjztsmuuP3if3yf2uW/b1i+9\n8V6Dbbf9aenS7p3bv/35nNJsKXHRzGoZO9Srs9nGQ/bvWPbnmxyTpr2ZrqUBAABK7/vlqwoLC7d0\na5lPGV21cmXvfseEEI4ZcPLzTz5etP26IZec+Ne/bXrO54r85Tn164cQ6uXk5C9fVsotJaqWUa1B\nnUr0caaVKgwAAMCW/Cd/1Zb7YNkLYUFBwftvv/mHP+/zyUcfbtiwoWj7K+Off3Lcw4nLbXMbTp+b\nV7dezor85Tn1GyT+DSGUZkuJNhQUfPvj8rJGrTiVKgwAAMCWFBZsuQ6WoxBefv3woRcOmjN7Vsud\nWw+7ZUTR9ndnz01caJvbMHHKaPuOnaZOntj7qKOnTZ7YvmOnUm4p+TkUFq5au76sUStOpQoDAABQ\nPuX8HsJfUFQIFy/Mu3TQwJCREUK4YcTIxk2blWZL6RfyPYQAAABbo8yF8MrBFzzxyNiiq0nvk6Wn\nEAIAAGyNMp8y+ulHH3zw5fzadTb/zE8AAACqljJ/D2HHA7vkLfjuFz63FAAAgCqhzEcIHxh15wOj\n7iy6msZTRgEAANgaZS6EGiAAAMBvQ5lPGQUAAOC3ocyFcPbnM/p3O6Rdi8ZPjXvknTemVUQmAAAA\nUqDMhXDYkME3j7p348aNnbscPOLG6yoiEwAAAClQnlNGd2yxUwih4Q6NMrMykxwHAACAVClzIaxe\nPfvrOV+EEL6Z+/XGDRsrIBIAAACpUOZPGb3i+uFDLhyUmZV11onHXX/byIrIBAAAQAqUuRC23KX1\nI8++VBFRAAAASKUyF8K2uQ03veprCQEAAKqocn4xfWFh4VdfzH7m349WQCQAAABSoZxfTJ+RkZHb\nvMW7b76e3DQAAACkzFadMtrv+AFJDQMAAEDqlPOUUQAAAKq6cp4yCgAAQFW3tZ8ymuCwIQAAQJVT\n5iOE/f9y4t3/euzjeQvvefjffY85bmbeEm0QAACgKipzIZz9+YxOB3bJzs7ueMBBX86aVRGZAAAA\nSIEyF8ING9ZPmzIp8e+6tWsrIhMAAAApUOb3EF49/LahFw4655QTWrTa+Zrht1VEJgAAAFKgzIVw\nj73aPfXKaxURBQAAgFQqz3sI+3c7pF2Lxk+Ne+SdN6ZVRCYAAABSoMyFcNiQwTePunfjxo2duxw8\n4sbrKiITAAAAKVCeL6bfscVOIYSGOzTKzMpMchwAAABSpcyFsHr17K/nfBFC+Gbu1xs3bKyASAAA\nAKRCmT9Upnp29hUXn5+ZlXXWicddf9vIisgEAABACpS5ELZus1vH/Q9s32n/atXKc7opAAAAlUSZ\nC+FDo0c9NHpU0dWZeUuSmgcAAIAUKXMh1AABAAB+G8pw2mffQw9IXLj9Jt82AQAAUOWV532A99x+\na9JzAAAAkGI+GAYAACBSCiEAAECkyvChMl/Omtk2t2HictEFnzEDAABQRZWhEOp+AAAAvyVOGQUA\nAIiUQggAABAphRAAACBSCiEAAECkFEIAAIBIKYQAAACRUggBAAAipRACAABESiEEAACIlEIIAAAQ\nKYUQAAAgUgohAABApBRCAACASCmEAAAAkVIIAQAAIqUQAgAAREohBAAAiJRCCAAAECmFEAAAIFIK\nIQAAQKQUQgAAgEgphAAAAJFSCAEAACKlEAIAAERKIQQAAIiUQggAABAphRAAACBSCiEAAECkFEIA\nAIBIKYQAAACRUggBAAAipRACAABESiEEAACIlEIIAAAQKYUQAAAgUgohAABApBRCAACASCmEAAAA\nkVIIAQAAIqUQAgAAREohBAAAiJRCCAAAECmFEAAAIFIKIQAAQKQUQgAAgEgphAAAAJFSCAEAACKl\nEAIAAERKIQQAAIiUQggAABAphRAAACBSCiEA
AECkFEIAAIBIKYQAAACRUggBAAAipRACAABESiEE\nAACIVNIK4bKflp527FHH9DisT5fOb097LYSwKG/BCX179Du8ywl9eyxemFfKLQAAAKRG0grh2Hvv\n7nRgl8fGv3L9iJH/+Ps5IYTh11zRs2+/J16e0uvI/jdeNbSUWwAAAEiNjJl5S5Kyo7dfn7pLm912\naNT4+0ULjzzsoLdmfLlv29YvvfFeg223/Wnp0u6d27/9+ZzSbClx59Uzq7XYPmezjR3a75OU5OXw\n9rvvpWtpAACA0pv3w7KCwsIt3ZqVrGU6dD4ghPDxB+8PGzL4kquGhRBW5C/PqV8/hFAvJyd/+bJS\nbtmCjOyszGRF3XqVKgwAAMAWZYSwxT6YvEK4YcP6YZdd8tN//zv8zntatd41hFC3Xs6K/OU59Rsk\n/i3llhKt37hx1sIfkxV161WqMAAAAOWTtPcQPjT67sZNm424b0yiDYYQ2nfsNHXyxBDCtMkT23fs\nVMotAAAApEbS3kN4Ur8+8+d+XT07O3F10rvTFy/Mu3TQwJCREUK4YcTIxk2blWZL6Vcc0LtrUpKX\nw9jnJqRraQAAgGRJWiFMPYUQAABga/hiegAAgEgphAAAAJFSCAEAACKlEAIAAERKIQQAAIiUQggA\nABAphRAAACBSCiEAAECkFEIAAIBIKYQAAACRUggBAAAipRACAABESiEEAACIlEIIAAAQKYUQAAAg\nUgohAABApBRCAACASCmEAAAAkVIIAQAAIqUQAgAAREohBAAAiJRCCAAAECmFEAAAIFIKIQAAQKQU\nQgAAgEgphAAAAJFSCAEAACKlEAIAAERKIQQAAIiUQggAABAphRAAACBSCiEAAECkFEIAAIBIKYQA\nAACRUggBAAAipRACAABESiEEAACIlEIIAAAQKYUQAAAgUgohAABApBRCAACASCmEAAAAkVIIAQAA\nIqUQAgAAREohBAAAiFRWugNUeQN6d03X0mOfm5CupQEAgN8ARwgBAAAipRACAABESiEEAACIlEII\nAAAQKYUQAAAgUgohAABApBRCAACASCmEAAAAkVIIAQAAIqUQAgAAREohBAAAiJRCCAAAECmFEAAA\nIFIKIQAAQKQUQgAAgEgphAAAAJFSCAEAACKlEAIAAERKIQQAAIiUQggAABAphRAAACBSCiEAAECk\nFEIAAIBIKYQAAACRykp3AJJjQO+u6Vp67HMT0rU0AACwNRwhBAAAiJRCCAAAECmFEAAAIFIKIQAA\nQKQUQgAAgEgphAAAAJFSCAEAACKlEAIAAERKIQQAAIhUVroD8JsyoHfXdC099rkJ6VoaAACqKEcI\nAQAAIqUQAgAAREohBAAAiJT3EPIb5K2MAABQGo4QAgAAREohBAAAiJRTRqGiOHMVAIBKzhFCAACA\nSDlCCL9xDlQCALAljhACAABEyhFCIBUcqAQAqIQUQiAu6aqmeikAUAklrRAuylsw+Nwz16xeXbNW\nrZvuuLtJs9xk7Rngt6eS9FIx0rIuAFQeSXsP4fBrrujZt98TL0/pdWT/G68amqzdAgAAUEGSdoTw\n3TffuPz6f4YQDj68x63XD0vWbhMyQqielZncfW6N7MoRRoxNibEpMTYlxqbE2NRmMY7pfkhaYjz2\n4iQxthSDSqiSzEYliQFVwroNG3/h1qQVwhX5y3Pq1w8h1MvJyV++LFm7TcjKzNylUYPk7nNrVJIw\nYmxKjE2JsSkxNiXGpsTYVOWMsffee6clxocffijGlmKkixHdVOWcDTEqYYyELxb/t6CgcEsPyZiZ\ntyQpa+/btvWEt97Pqd9g2U9Lu3fe960ZXyZltwlZ1ao127ZuEncIAAAQgwX/zS8o3GIhTNoRwvYd\nO02dPLH3UUdPmzyxfcdOydptwoaCgm9/XJ7cfQIAAEQuaUcIFy/Mu3TQwJCREUK4YcTIxk2bJWW3\nAAAAVJCk
FUIAAACqlqR97QQAAABVi0IIAAAQKYUQAAAgUgohAABApBRCAACASCmEAAAAkVIIAQAA\nIqUQAgAAREohBAAAiFTmwPMvTneGNLj1+mGtWrfeZpu6YiRiNNtxx5z6DdIbY9St/2zUpIkYlSrG\nA6PurFO37nbbNxRDjM08+uB9ISM0atxEjEoVY83aNc1yd0xvjKfGPbJq1UoxKlWMF5996ofvv2++\nU0sxxNjM5Akvzvv6q1a77CpGzDEiPUL44rNPLV+2LN0pKlGMH3/4Id0pwsQXnxejssV4bdKExQvz\n0p1CjMoY482pr343/5t0pxBj8xhz53yZ7hThvbfeEKOyxZj+/ntfzJyR7hRiVMYYMz7+eMbHH6c7\nhRhpjhFpIQQAAEAhBAAAiFRWugOUyvr161b//HMSd1hQULBqRX7+8jSfrlmJYqxakfYYG8WohDE2\nbPx51SoxxChuw4b1q3/+WYzKFmPN6tVpj7F+/ToxKluMdevWrlmzRgwxSoixdk0IQYzffIy69XIy\nMjK2dGvGzLwlSVysgrw59dXbb7ouiTtcvuynberWy8zMTOI+q3SMOtvUzcpK8/8dyF++vHadOmJU\nqhgr8vNr1qpZvXq2GGJsZuWKFdk1srOza4hRqWJUr169Rs2a6Y2xauXKrKwsMSpVjJ9XrapWrVrN\nWrXEEGMziSMutWrXFuO3HWPME8/VrlNnS7dWjUIIAABA0nkPIQAAQKQUQgAAgEgphAAAAJFSCAEA\nACKlEAIAAERKIQQAAIhUdIVwUd6CE/r26Hd4lxP69li8MC+VS8/+fMYfWzffUozUBHvuicd6H9Sp\n10Edn3vy36mPsXHDhh7771t0ddlPS0879qhjehzWp0vnt6e9lrI8m8VY8sN/zvjL0f27HXxcr8MX\n5S1ITYzyPfcUxEhI8ayWGCP1s7phw/qLBp5+dPdDjzi48/hnniwxWFpipGVEE8o6CSmIUb5gFREj\n9SO64Nv5e+/a4pD2fzik/R9G3HhtSNOIFo+RlhEt36IpiJGQ4hEtMUbqRzQxFYe0/8PB+/z+dzs1\nDWka0eIx0vVbtBw/ghTESEj9b9HiMVI/onPnfHlC3x7H9ux64pG9EpOQlhEtHiONf+gToiuEw6+5\nomfffk+8PKXXkf1vvGpoytb9cckPd906fM3q1VuKkZpg119+2f3/fuq+cU9ef/llKY7x0nPP9Dlk\n/2/mfl20Zey9d3c6sMtj41+5fsTIf/z9nBYaRzEAAAl9SURBVNTkKR7juqGXdu3Z5/GXJvc99vgR\nN16Xmhjle+4piBHSMaslxkj9rL419bXatev8+8WJI8c8MuyyS0oMlpYYaRnRUK5JSEGM8gWriBip\nH9H58+b+5ZTTJ707fdK70wcNviykaUSLx0jLiJZv0RTECOkY0RJjpH5EE1Mx6d3ppw0898KhV4Y0\njWjxGOn6LVqOH0EKYoQ0/RYtHiP1I3rpeQPPHHTBuBcmnHLm2cOvuSKkaUSLx0jXiBaJrhC+++Yb\nh3TrGUI4+PAe77/9VmoWXbt27ZWDL/jH1df9QozUBMtpUH9R3oLFC/Ny6tdPcYzDevR6dtK0Tbfs\n3b7D4b2PCCE02Hbb9evXpyZP8Rhvvz515mef/L5VswfuumP/LoekJkb5nnsKYqRlVovHCOmY1ba/\n+5+zzr+ooKBg8cKF9XLqlRgsLTHSMqLlm4QUxEjLiBZfNKRjRL/9Zt5rEyf8vlWzIw7uPPvzGSFN\nI1o8RlpGtHyLpiBGWka0eIyQvr/4C76dP23yxONPPi2kaUSLx0jLiIZy/QhSECNd/1G6WYziW1IQ\n44tZM/fed78QQvtO+7/31pshTSNaPEa6RrRIdIVwRf7yxNjVy8nJX74sNYteN+SSE//6tybNcn8h\nRmqCXXz5Ncf27Hpcr8MvvvzqFMfIzMzMzMradEuHzgfs0Kjxxx+8P/Dkv1
xy1bDU5CkeY9XKlc12\nbP7u7HnHnXzamNGjUhOjfM89BTHSMqvFY4R0zOr2DXdo1KTp2Sf/ZcCRPc++8JISg6UlRlpGtHyT\nkIIYaRnR4ouGdIxo/QYNTh14zntffNPrqKMvv+i8kKYRLR4jLSNavkVTECMtI1o8RkjfX/wbrxp6\n3iVDqlWrFtI0osVjpGVEQ7l+BCmIka7/KN0sRvEtKYjRZvc9nhr38Lp16x6+/57ly34KaRrR4jHS\nNaJFoiuEdevlrMhfHv73ZW2QmkVfGf/8Sf36tM1tGEJom9tw7dq1xWOkJthNVw995NmXHn7mxRuu\nHFLioql8fTZsWH/l4AvG3HPX8Dvv6dm3X7ry1Mupf/wpf61Ro0a/4wd8OWtmamKU77mnIEZaZrV4\njJCOWd24cWNBQcHIMY/86+nxw6++vMRgaYmRlhEt3ySkIEZaRrT4oiEdI9rjiKN6Hdk/Ozv7uJNP\nmzN7dkjTiBaPkZYRLd+iKYiRlhEtHiOk6S/+rBmfbVi/vs0ebRNX0/WHfrMYaRnRUK4fQQpipOs/\nSjeLUXxLCmJc888Rjz304J/btPx+8eK69XJCmka0eIx0jWiR6Aph+46dpk6eGEKYNnli+46dUrPo\nu7PnzsxbMjNvSQhhZt6SGjVqFI+RmmDLf1rWrHnzZs2br8jPL3HRVL4+D42+u3HTZiPuG9Oq9a6J\nLWnJs89+Hd95fWoI4YN33kr88UhBjPI99xTESMusFo8R0jGrD4y644FRd2RkZDRq0rSgoLDEYGmJ\nkZYRLd8kpCBGWka0+KIhHSN62fnnvPfWGyGEzz7+qM0ee4Q0jWjxGGkZ0fItmoIYaRnR4jFCmv7i\nP/HI2F5H9i+6mq4/9JvFSMuIhnL9CFIQI13/UbpZjOJbUhDj6zlfjBzz8CffLNxjz3YdDzgwpGlE\ni8dI14gWyUgMRDwWL8y7dNDAkJERQrhhxMjGTZulcvW2uQ0TL3jxGKkJ9twTj90/6s7CwsLTBp7b\n+6ijUx+j6BUIIZzUr8/8uV9Xz85OXJ307vSU5dk0xqK8BRef87d1a9dWz86++qZbd961TQpilO+5\npyBG0U2pnNUSY6R+Vn9auvSigafnL1+2ft26M/9+0aHde6ZlRIvHSMuIFinTJKQgRvmCVUSM1I/o\nN3O/HnLBuRvWb8iqnnXljTe3brN7Wka0eIy0jGj5Fk1BjKKbUjmiJcZI/YgWFhYetHe7pyZO3Xa7\n7RJb0jKixWOk67doOX4EKYhRtD3Fv0WLx0j9iL79+tTh11xRvXr2ttttN+yW27dvuENaRrR4jPT+\noQ8RFkIAAAASojtlFAAAgASFEAAAIFIKIQDxunvELW9OfXXMPXe9Mv75X7jbOacOOLZn1/8sXlTK\n3Y69757EhcRH+QFApaUQAhCvL2bO2K3tnnNmz9p19z1+4W6vvvLyI8+91KhJ01Lu9sb/+1z16XPz\ntjYiAFQkhRCASA0+58xXX3l5/9+3feXF5+++7eai7Z9/+nH/bocc1+vwo7sfOvOzTxIbB/31pM2O\nELbNbXjT1Ze3zW3Y+6BOx/fu1vugTjdeNTSEcM6pA0IIiTv/YefcxD1vvvaqIw7ufNUlF4YQ5nwx\n+9ieXY/v0+36y/+ROIT42NgH+3c75C9HdO976AH333VHip4/APiUUQCi9Z/Fi4ZeeN6t9zww+Nwz\n73zgX0Xb+3TpPPjKa/bttP/b014bPuzKZyZNK/69FyGEtrkN733k8Tenvda9z5Ft2/1u+bKfOuy5\na+JuRfdPXGib2/CZSdOa7dj8z7u1nJm3pH+3g8+7ZEiHzge888a00449ambekj+2bv7vFyfusutu\nq3/++ZEH7z1t4KBUvg4AxCwr3QEAIA1eePqJS849K4Tw591ahhDOO/3k20Y/mLhp3tdz/rRvhxDC\nnzrsN/erOb+wk/ad9l+9evVDo0fVq1
+/etYv/UltvdvuGRkZictfzJr55w77hRD2br9vYsvt9z10\n7x0jVq5cUb169U2/UBsAKppCCECMevbtt/THHxs1aboif3lmZlbfY44ruqnlzq0/fPed9h07f/DO\n2zvv0voXdlKtWrWLz/nbpHemb7v99gu+nV/0WTLFFbXBEEKrnVt/8M7b+3baf/r77yW2vDZxwtX/\nvK1GjRp5333bu0unj776bqufHwCUikIIQKS+nDWzS9fuD95959EDTt50+7BbRlzzj4uzsqpv3Lhx\n2C23//JOTvzr30477qgdGjVu1XrX1m12H3PPXSedcVaLlq0WL8xr0iy3xIdce+vtVw6+ILtGjXb/\n84fMrKwQQvOWrY7pfkidunXXrV174ZArk/T8AODXeQ8hAKTUHf+8od3v/9hx/wNffuHZ0Xfc9vyr\nb6Y7EQDxcoQQAFKq0wFdLjv/nIHfzNup1c5X3Hjzrz8AACqMI4QAAACR8j2EAAAAkVIIAQAAIqUQ\nAgAAREohBAAAiJRCCAAAECmFEAAAIFIKIQAAQKQUQgAAgEgphAAAAJFSCAEAACKlEAIAAERKIQQA\nAIiUQggAABCp/wf7XPe/JSwuVAAAAABJRU5ErkJggg==\n"
}
],
"prompt_number": 6
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# Ratings histogram\n",
"ratings_kv = ratings.map( lambda(user, movie, rating, ts): (rating, 1) )\n",
"ratings_hist = ratings_kv.reduceByKey(lambda x,y: x + y)\n",
"\n",
"histogram_per_rating_df = pd.DataFrame(ratings_hist.collect(), columns = ['rating', 'frequency'])\n",
"histogram_per_rating_df.sort(inplace=True, columns = 'rating')\n",
"histogram_per_rating_df.describe()"
],
"language": "python",
"metadata": {},
"outputs": [
{
"html": [
"<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>rating</th>\n",
" <th>frequency</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>count</th>\n",
" <td> 10.000000</td>\n",
" <td> 10.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>mean</th>\n",
" <td> 2.750000</td>\n",
" <td> 1000005.400000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>std</th>\n",
" <td> 1.513825</td>\n",
" <td> 957962.452631</td>\n",
" </tr>\n",
" <tr>\n",
" <th>min</th>\n",
" <td> 0.500000</td>\n",
" <td> 94988.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>25%</th>\n",
" <td> 1.625000</td>\n",
" <td> 373678.500000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>50%</th>\n",
" <td> 2.750000</td>\n",
" <td> 687664.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>75%</th>\n",
" <td> 3.875000</td>\n",
" <td> 1378550.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>max</th>\n",
" <td> 5.000000</td>\n",
" <td> 2875850.000000</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>8 rows \u00d7 2 columns</p>\n",
"</div>"
],
"metadata": {},
"output_type": "pyout",
"prompt_number": 7,
"text": [
" rating frequency\n",
"count 10.000000 10.000000\n",
"mean 2.750000 1000005.400000\n",
"std 1.513825 957962.452631\n",
"min 0.500000 94988.000000\n",
"25% 1.625000 373678.500000\n",
"50% 2.750000 687664.000000\n",
"75% 3.875000 1378550.000000\n",
"max 5.000000 2875850.000000\n",
"\n",
"[8 rows x 2 columns]"
]
}
],
"prompt_number": 7
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"%%R -w 1200 -i histogram_per_rating_df\n",
"p = ggplot(histogram_per_rating_df, aes(x = factor(rating), y = frequency)) + theme_economist() +\n",
" geom_bar(stat = 'identity') + \n",
" labs(title = '# rating histogram', x = 'rating', y = 'Frequency')\n",
"print(p)"
],
"language": "python",
"metadata": {},
"outputs": [
{
"metadata": {},
"output_type": "display_data",
"png": "iVBORw0KGgoAAAANSUhEUgAABLAAAAHgCAMAAACCSWStAAAC8VBMVEUAAAABAQECAgIDAwMDBAQE\nBAUFBQYGBgYICAgICQkJCgoKCwsLDAwMDQ0NDQ4NDg8ODxAPEBEQERIREhISExMSFBQVFhcWFxgX\nGBkXGRoYGhsZGxwaHBwaHB0bHR0cHh4cHh8dHyAeICEfISIgIiMhIyQiJCUiJSYjJSYjJickJick\nJiglJykmKCkmKSkmKSonKisoKywpLC0qLS4rLi8rLjAsLzEtMDIuMTMvMjQwMzUwNDUxNTYyNjcz\nMzM1ODo1OTs4PD45PT86PkA6P0E7P0A7P0E8QEI9QUM+QkQ/Q0U/REZBRkhCR0lDSEpESEtESUxF\nSkxGS01HTE5ITU9JTlBKT1FLUFNMUVRNUlVOU1ZPVFdPVVhQVlhRV1lSWFpTWVtUWVxVW15WXF9Y\nXmFZX2JZYGNaYWRcYmVdY2ZeZGdeZWhfZmlgZ2piaWxjam5ka29lbHBmbXBnbnFob3JocHNpcXRr\ncnZsc3dtdHhtdXlweHtxeXxyen1yen5ze390fIB1fYF2foJ3f4N3gIR4gYV5goZ6g4d7g4d8hIh9\nhop+h4t/iIyAiY2Bi4+Ci5CEjZKFjpOGj5OGkJSHkZWIkpaJk5eKlJiLlJmLlZqMlpuPmZ6Qmp+R\nnKCSnKGUnqOVn6SWoKWWoaaXoqeYo6iZpKmapaqbpaqbpqucp6yeqa6fqq+gq7CgrLGhrbKirbOj\nrrSkr7Skr7WlsLamsrens7iotLmqtruqtryrt72suL6tub+uusCvu8GvvMKwvMGwvcKxvsOyvsSz\nv8W0wMa0wce2w8m4xcu5xsy5xs26x867yM68yc+9ytC+y9G+zNK/zdPAztTBz9XD0NfD0djE0tnF\n09nG1NrH1dvI1tzI193J197K2N/L2eDM2uHN2+LN3OPO3eTP3uXQ3+XR4ObS4OfS4ejT4unU4+rV\n5OvW5OvW5evW5ezX5ezX5uzY5uzZ5u3a5+3a5+7b6O7g6/Dj7fLr8vbu9Pf0+Pr5+/z///9dTxJt\nAAAbDUlEQVR4nO3de3yeZX3H8TtNK3gAmg7xsBVqS3UKVaBJsaYtGds8sDlHRGTuoILu0OFhY3Ob\nigKbzAMtCIqn4YSJVJTBwG3aTlYR1o2kCNW01FFnStvkcoobO/PXnidPEsidxLZP7uv7u6/f9Xm/\nXtQ0PM2V353r93m1BWoRACARhfUnAACHimABSAbBApAMggUgGQQLQDIIFoBkECwAySBYAJJBsAAk\ng2ABSAbBApAMggUgGQQLQDIIFoBkECwAyZhzsL70hsGvvuHuGf7G1q37Gx++GJrt4Im/88SXjP0Y\nAJjZnIP1ex17Li3un+kjN0O0ePGu2Q6e6NQTXzJ73wBg7sE668Rw3tNHZ/rIzfgMDY3MdvBEm574\nEoIF4EeYY7DeUrTc23h7qCj+etkl33z9cZ1PO3Pz/uZ7v9QsUOPdnz5h3mlfDmH0vT/W9amJJhXF\nny3vfMnXWpG67zVd8571GztbP2b0fSfMO+F9o4+/fsoHbh5z3bJ5p3zpdfOXfG7u8wNIyByDdfGS\n4tilRdfyb4SxYD27uOSni0WvOKl49sPLi+I5W8aDdeSLnlacOBquLOZ3P/XxYB254sji9LFgja4s\nXvSLzype3voxf1occeYRxfvD5OunfOCRxveedMpTiwXHPLN4+mw/fQPg0lx/SXhdcdsXij8fe7NR\nkvWDD5179lfCjqLY1frlXSs3nwt3FcXusLz4cKNGk8H6WLit6Bx7ybcbP7Eavf9X3jD++16Nv3Nt\nsSRMvn7qB25874vh74ria42fkO2c42cPIClzDNZpXcWyY4slFzffbpTkOyE8fNUvrx77adQTgjUa\nRhtv7CmKPWH348H6VvhWUbR+hnV6URz76g3fGfsx/9
z8MA81Xjv5+qkfeOzjjX3D73gBmZljsJ7S\n+i2stzTfHmrmJ7y6OOPym0vBCmNvNCL0L2HP48Eaav2I5hsPf+ZNS4qib+ztPc08Nb+ZfP3UDzz2\nvckfCSAjcwzWaNcvhRN/rvV2qysLin9s/FJvLFjfmhqs0WcU14aNMwXruxdcHEZvLha0fszx478k\nnHz91A9MsIBszTFY24s/fqh4R+vtVleWFie89GnN8BxZ/MK2KcEKf1Qs6FkwU7BGTy5W9y8tXtH6\nMR8Y+033D4XJ10/9wAQLyNYcg/WZ4ou3Fp9uvd3qyh0r5p1y08Liw+Hipz5p89Rg7X/LMUdfOeMv\nCf+pf1FH1+uGWj9m9PIT5i/5wGiYfP3UD0ywgGzNMVh7dx94ePch/uc0w29725fDHUXn3kP82If7\negDeCf/j59GeYv6SecVrYr0egHezBmvfuatWXDvlPQNrV64dDA+8rHv1QHtnPXDuMfOeuX5PtNcD\ncG7WYF3/xjBwzOT3djf+6r8ibDg79F8VNp6n+MwAoGTWYO24b+TW48Pdfb1rtoy/bOFQGOoKR735\niGUfn3jR9/8TACL73sGDFcJZxUfCmq1hW8/4yzpGwoGO0HHJ8OU9Ey959DEAiOyRgwfrwMjobceG\n+UVRdA6P/dvswwt3hZ2LQtfeMLxg4kU//B8AiOz7Bw/Wu94d7l0Ult4Tdmwcf1n/NeHqc0L/9eHG\nntl+EABENGuwhs7sXvGpcPuq3lWbGt/rbPw1eEZf3/YwsKa79y7d5wcAk/g/oQCQDIIFIBkEC0Ay\nCBaAZBAsAMkgWACSQbAAJINgAUgGwQKQDIIFIBkEC0AyCBaAZBAsAMkgWEDV1sVnPaIVggVUjWBF\nQ7CAqhGsaAgWUDWCFQ3BAqpGsKIhWEDVCFY0BAuoGsGKhmABVSNY0RAsoGoEKxqCBVSNYEVDsICq\nEaxoCBZQNYIVDcECqkawoiFYQNUIVjQEC6gawYqGYAFVI1jRECygagQrGoIFVI1gRUOwgKoRrGgI\nFlA1ghUNwQKqRrCiIVhA1QhWNAQLqBrBioZgAVUjWNEQLKBqBCsaggVUjWBFQ7CAqhGsaAgWUDWC\nFQ3BAqpGsKIhWEDVCFY0BAuoGsGKhmABVSNY0RAsoGoEKxqCBVSNYEVDsICqEaxoCBZQNYIVDcEC\nqkawoiFYQNUIVjQEC6gawYqGYAFVI1jRECygagQrGoIFVI1gRUOwgKoRrGgIFlA1ghUNwQKqRrCi\nIVhA1QhWNAQLqBrBioZgAVUjWNEQLKBqBCsaggVUjWBFQ7CAqhGsaAgWUDWCFQ3BAqpGsKIhWEDV\nCFY0BAuoGsGKhmABVSNY0RAsoGoEKxqCBVSNYEVDsICqEaxoCBZQNYIVDcECqkawoiFYQNUIVjQE\nC6gawYqGYAFVI1jRECygagQrGoIFVI1gRUOwgKoRrGgIFiwIVtpwp10PZ4tgwYJgpQmWRwQLFgQr\nTbA8IliwIFhpguURwYIFwUoTLI8IFiwIVppgeUSwYEGw0gTLI4IFC4KVJlgeESxYEKw0wfKIYMGC\nYKUJlkcECxYEK02wPCJYsCBYaYLlEcGCBcFKEyyPCBYsCFaaYHlEsGBBsNIEyyOCBQuClSZYHhEs\nWBCsNMHyiGDBgmClCZZHBAsWBCtNsDwiWLAgWGmC5RHBggXBShMsjwgWLAhWmmB5RLBgQbDSBMsj\nggULgpUmWB4RLFgQrDTB8ohgwYJgpQmWRwQLFgQrTbA8IliwIFhpguURwYIFwUoTLI8IFiwIVppg\neUSwYEGw0gTLI4IFC4KVJlgeESxYEKw0wfKIYMGCYKUJlkcECxYEK02wPCJYsCBYaYLlEcGCBcFK\nEyyPCBYsCFaaYHlEsGBBsNIEyyOCBQuClSZYHhEsWBCsNMHyiGDBgmClCZZHBAsWBCtNsDyaNVg7\nzzz95JumvGdg7c
q1g+GBl3WvHoj+acE5wUoTLI9mDdZFl4XNx01+b3fjr/4rwoazQ/9VYeN5is8M\nnglWmmB5NGuwNt0fti8Kd/f1rtky/rKFQ2GoKxz15iOWfXziRf/+f0A7BCu9zvV0dsNZ+MHBgxXC\n7ad+NKzZGrb1jL+sYyQc6Agdlwxf3jPxkkcfA9ohWOl1rqezG87CIwcP1r7zX/X1EOYXRdE5XDQN\nL9wVdi4KXXvD8IKD/bwN+NEEK80vCT2aNVjveWfz26X3hB0bx1/Wf024+pzQf324sWe2HwQcGsFK\nEyyPZg1W33GLFy8Ot6/qXbWp8b3Oxl+DZ/T1bQ8Da7p779J9fvBJsNIEyyP+PSxYEKw0wfKIYMGC\nYKUJlkcECxYEK02wPCJYsCBYaYLlEcGCBcFKEyyPCBYsCFaaYHlEsGBBsNIEyyOCBQuClSZYHhEs\nWBCsNMHyiGDBgmClCZZHBAsWBCtNsDwiWLAgWGmC5RHBggXBShMsjwgWLAhWmmB5RLBgQbDSBMsj\nggULgpUmWB4RLFgQrDTB8ohgwYJgpQmWRwQLFgQrTbA8IliwIFhpguURwYIFwUoTLI8IFiwIVppg\neUSwYEGw0gTLI4IFC4KVJlgeESxYEKw0wfKIYMGCYKUJlkcECxYEK02wPCJYsCBYaYLlEcGCBcFK\nEyyPCBYsCFaaYHlEsGBBsNIEyyOCBQuClSZYHhEsWBCsNMHyiGDBgmClCZZHBAsWBCtNsDwiWLAg\nWGmC5RHBggXBShMsjwgWLAhWmmB5RLBgQbDSBMsjggULgpUmWB4RLFgQrDTB8ohgwYJgpQmWRwQL\nFgQrTbA8IliwIFhpguXRtGD9wb0WnwYyI1hpguXRtGBtWP1TH/uuxWeCnAhWmmB5NMMvCbesOPqC\nf9B/JsiJYKUJlkfTgnXrBae8bcuHnmHxuSAfgpUmWB5NC9av3TwSwoHLLD4X5EOw0gTLo2nB2vUX\n4aO7LT4T5ESw0gTLo2nBeunV4YMvtfhMkBPBShMsj6YF6yWNv1YbfCLIimClCZZH04L14i/s+8se\ni88EORGsNMHyaFqwNp/WedLfWHwmyIlgpQmWR/ynObAgWGmC5dG0YJ1fNFh8JsiJYKUJlsfhprVp\nxZ6IswItgluf7U67Hm5asN5652jEYYExgluf7U67Hm5asAp+SYj4BLc+2512PRxtggXBrc92p10P\nR7BgQXDrs91p18NN//ewejqv/Hy8aYEmwa3PdqddDzctWL3bigdOjzguEAgWw7U53PRgNd7VG3Fc\nIBAshmtzuGnB6vtacQ8/w0Jkgluf7U67Hm5asO7unb/0jojjAoFgMVybw/FPCWFBcOuz3WnXw/Ev\njsKC4NZnu9Ouh5uhTaNfXR9tWGCM4NZnu9Ouh5vpJ1N7VsSaFWgR3Ppsd9r1cDP+kvD8iOMCgWAx\nXJvD8dtVsCC49dnutOvhCBYsCG59tjvteriZ/ykh/5wQcQlufbY77Xq4aWW64LN7b3x9xHGBQLAY\nrs3hpgWr+Z/l8H/zhcgEtz7bnXY93LRgnXbDvhtOjjguEAgWw7U53LRgfeXUzufx3xIiMsGtz3an\nXQ/H767DguDWZ7vTrofjTxyFBcGtz3anXQ/HnzgKC4Jbn+1Oux6OP3EUFgS3Ptuddj0cf+IoLAhu\nfbY77Xq4acH6Gf7EUcQnuPXZ7rTr4aYF68KbRiIOC4wR3Ppsd9r1cPyJo7AguPXZ7rTr4WgTLAhu\nfbY77Xq4UrBOCeGiqNMCTYJbn+1Oux5uerD4ORfiE9z6bHfa9XAECxYEtz7bnXY9HMGCBcGtz3an\nXQ9XyhN/3igkBLc+2512PRxtggXBrc92p10PR7BgQXDrs91p18MRLFgQ3Ppsd9r1cAQLFgS3Ptud\ndj0cwYIFwa3PdqddD0ewYEFw67PdadfDESxYENz6bHfa9XAECxYEtz7bnXY9HMGC
BcGtz3anXQ9H\nsGBBcOuz3WnXwxEsWBDc+mx32vVwBAsWBLc+2512PRzBggXBrc92p10PR7BgQXDrs91p18MRLFgQ\n3Ppsd9r1cAQLFgS3Ptuddj0cwYIFwa3PdqddD0ewYEFw67PdadfDESxYENz6bHfa9XAECxYEtz7b\nnXY9HMGCBcGtz3anXQ9HsGBBcOuz3WnXwxEsWBDc+mx32vVwBAsWBLc+2512PRzBggXBrc92p10P\nR7BgQXDrs91p18MRLFgQ3Ppsd9r1cAQLFgS3Ptuddj0cwYIFwa3PdqddD0ewYEFw67PdadfDzR6s\n/ctL7xhYu3LtYHjgZd2rByoZHBkT3Ppsd9r1cLMG6xPPe+Lf2t34q/+KsOHs0H9V2HheZcMjU4Jb\nn+1Oux5u1mAd2N/8W3f39a7ZMv6yhUNhqCsc9eYjln184kU//G+gHYJbv871dHkN9/2DB6v1t9Zs\nDdt6xt/uGAkHOkLHJcOX90y85NHHgHYIbv0619PlNdwjhxqs+UVRdA4XTcMLd4Wdi0LX3jC8YOIl\nP/gvoB2CW7/O9XR5DfevhxqspfeEHRvH3+6/Jlx9Tui/PtzYM/sPAg6F4NZn+9s8roc7SLBuX9W7\nalPjfzsbfw2e0de3PQys6e69q9ongPwIbn22O+16OP49LFgQ3Ppsd9r1cAQLFgS3Ptuddj0cwYIF\nwa3PdqddD0ewYEFw67PdadfDESxYENz6bHfa9XAECxYEtz7bnXY9HMGCBcGtz3anXQ9HsGBBcOuz\n3WnXwxEsWBDc+mx32vVwBAsWBLc+2512PRzBggXBrc92p10PR7BgQXDrs91p18MRLFgQ3Pp1ro9T\nf8HqMhzBggXBrSdYcdgOR7BgQXDrCVYctsMRLFgQ3HqCFYftcAQLFgS3nmDFYTscwYIFwa0nWHHY\nDkewYEFw6wlWHLbDESxYENx6ghWH7XAECxYEt55gxWE7HMGCBcGtJ1hx2A5HsGBBcOsJVhy2wxEs\nWBDceoIVh+1wBKu2BBfD7tqLh/N3nPoLVpfhCFZtCS4GwUr2OPUXrC7DEazaElwMgpXsceovWF2G\nI1i1JbgYBCvZ49RfsLoMR7BqS3AxCFayx6m/YHUZjmDVluBiEKxkj1N/weoyHMGqLcHFIFjJHqf+\ngtVlOIJVW4KLQbCSPU79BavLcASrtgQXg2Ale5z6C1aX4QhWbQkuBsFK9jj1F6wuwxGs2hJcDIKV\n7HHqL1hdhiNYtSW4GAQr2ePUX7C6DEewaktwMQhWssepv2B1GY5g1ZbgYhCsZI9Tf8HqMhzBqi3B\nxSBYyR6n/oLVZTiCVVuCi0Gwkj1O/QWry3AEq7YEF4NgJXuc+gtWl+EIVm0JLgbBSvY49ResLsMR\nrNoSXAyClexx6i9YXYYjWLUluBgEK9nj1F+wugxHsGpLcDEIVrLHqb9gdRmOYNWW4GIQrGSPU3/B\n6jIcwaotwcUgWMkep/6C1WU4glVbgotBsJI9Tv0Fq8twBKu2BBeDYCV7nPoLVpfhCFZtCS4GwUr2\nOPUXrC7DEazaElwMgpXscbkMV0awaktwMQhWssflMlwZwaotwcUgWMkel8twZQSrtgQXg2Ale1wu\nw5URrNoSXAyClexxuQxXRrBqS3AxCFayx+UyXBnBqi3BxSBYyR6Xy3BlBKu2BBeDYCV7XC7DlRGs\n2hJcDIKV7HG5DFdGsGpLcDEIVrLH5TJcGcGqLcHFIFjJHpfLcGUEq7YEF4NgJXtcLsOVEazaElwM\ngpXscbkMV0awaktwMQhWssflMlwZwaotwcUgWMkel8twZQSrtgQXg2Ale1wuw5URrNoSXAyClexx\nuQxXRrBqS3AxCFayx+UyXBnBqi3BxSBYyR6Xy3BlBKu2BBeDYCV7XC7DlRGs2hJcDIKV7HG5DFdG\nsGpLcDEIVrLH5TJcGcGqLcHFIFjJHpfLcGUE
q7YEF4NgJXtcLsOVEazaElwMgpXscbkMV0awaktw\nMQhWssflMlxZ2sGyfXbpD0ewkj0ul+HKCFb7zy794QhWssflMlwZwWr/2aU/HEuW7HG5DFdGsNp/\ndukPx5Ile1wuw5URrPafXfrDsWTJHpfLcGUEq/1nl/5wLFmyx+UyXBnBav/ZpT8cS5bscbkMV0aw\n2n926Q/HkiV7XC7DlRGs9p9d+sOxZMkel8twZQSr/WeX/nAsWbLH5TJcGcFq/9mlPxxLluxxuQxX\nRrDaf3bpD8eSJXtcLsOVEaz2n136w7FkyR6Xy3BlBKv9Z5f+cCxZssflMlwZwWr/2aU/HEuW7HG5\nDFdGsNp/dukPx5Ile1wuw5URrPafXfrDsWTJHpfLcGUEq/1nl/5wLFmyx+UyXBnBav/ZpT8cS5bs\ncbkMV0aw2n926Q/HkiV7XC7DlRGs9p9d+sOxZMkel8twZQSr/WeX/nAsWbLH5TJcGcFq/9mlPxxL\nluxxuQxXRrDaf3bpD8eSJXtcLsOVEaz2n136w7FkyR6Xy3BlBKv9Z5f+cCxZssflMlwZwWr/2aU/\nHEuW7HG5DFdGsNp/dukPx5Ile1wuw5URrPafXfrDsWTJHpfLcGUEq/1nl/5wLFmyx+UyXBnBav/Z\npT8cS5bscbkMV0aw2n926Q/HkiV7XC7DlRGs9p9d+sOxZMkel8twZQSr/WeX/nAsWbLH5TJcGcE6\n9GcnOM33ca6H41lGOa2MYB36sxOc5vs418PxLKOcVkawDv3ZCU7zfZzr4XiWUU4rqzhYgmG4GMke\n53o4nmWU08oIVq1O832c6+F4llFOKyNYtTrN93Guh+NZRjmtjGDV6jTfx7kejmcZ5bQyglWr03wf\n53o4nmWU08oIVq1O832c6+F4llFOKyNYtTrN93Guh+NZRjmtjGDV6jTfx7kejmcZ5bQyglWr03wf\n53o4nmWU08pmDdbA2pVrB2d4z/T3P5FgGC5Gsse5Ho5nGeW0slmD1X9F2HD25Pd2T75n6vvLBMNw\nMZI9zvVwPMsop5XNGqyFQ2GoK9zd17tmy/jLWu9pfTvue/9RIhhmnfY418PxLJM9LpfhWg4hWB0j\n4UBHWLM1bOsZf1nrPa1vxz36WIlgmHXa41wPx7NM9rhchmt55BB+hrUr7FwU5hdF0TlcNA233tP6\ndty//S8ARPaDgwer/5pw9Tlh6T1hx8bxl7Xe0/oWAPRmDdbgGX1928Ptq3pXbWp8r3PyPa1vAUAv\n7T/AD0BWCBaAZBAsAMkgWACSQbAAJINgAUgGwQKQDIIFIBkEC0AyCBaAZFgH67rrpMe9/RvK0+78\nQ+Vp4cpbpMed/13labe8X3laeO+dytN2/ZbytHDjx6THVbp01sF6+9ulx/249B7esEJ5WnjtB6XH\nFd9WnvbBs5SnhbU3KE8bPFp5Wnj3m6THVbp0BCsiglUdglUdgtU+glUdglUdglWdGgXr4Qfnav36\nOX+Iw/Gsv1Ke9snnK097sP9S6XHFgPK0S39WedqDqz+pPO3vj1Ke9uA7Xi89roKlG60oWDd2z9Vz\nnzvnD3E4TjhVedoLn6M8rfvEF0iP+4mVytNesEx5WvfSFypPO+145WndP7lcelwFS7enomABgBDB\nApAMggUgGQQLQDIIFoBkECwAySBYAJJhF6yBtSvXDjbf2PaUxYt/V3Pm/uWacx4/STfczjNPP/km\n7Um64fadu2rFtdqThNcybH6y6KDJo3TTVXySXbD6rwgbzm6+8dnfUR35ieepxp08STfcRZeFzcdp\nT9INd/0bw8Ax2pN0w4Udr5St4cRRuukqPskuWAuHwlBX843LTzpixWbJkQf2q8adPEk33Kb7w/ZF\n2pN0w+24b+TW47Un6YYb/vlB1b2cPEo3XcUn2QWrYyQc6Gi+ce1H9l6yUnSobtzxk5TD3X7qR7Un\nKYc7q/iI
9iTdcG/8ouxeTh6lm67ikwx/hrUr7Jz4GcHwAtGh8mAF2XD7zn/V1yUHTTlJNNyBkdHb\njpWfJBru6KJhWH6UbOkqPcnw97CuCVefE8J3wq/eHG7pFh0qDpZyuPe8U3LM4ycph3vXu8O9ml/v\njp+kvZbKNWwepZyu4pPsgjV4Rl/f9hCWh3t6V/VuFR0qDpZyuL7jFi9erDxJOdzQmd0rPqU8SXst\n1cFSTlfxSfx7WACSQbAAJINgAUgGwYLSn3DlMBfcHig17pvmH+DDJ4IFgeLC4qSXnPTb4ZXFfY0r\nV7x1xQXhzhf3/ia3D4eJKwOB4qYLvxx2FWP3rRGsrz5UhO5N4fPcPhwmrgwEipFPv/bX108Ga7Tx\nzfx94WFuHw4TVwYCRXjyN8O2yWA1vzn58+Fmbh8OE1cGAkW46JSXX3jypWHZ4ESw/nZl7/r51p8X\nUkOwYOP3b9h/7fOtPwmkhmDBxh0ndiy/xfqTQGoIFoBkECwAySBYAJJBsAAkg2ABSAbBApAMggUg\nGQQLQDIIFoBkECwAySBYAJJBsAAkg2ABSAbBApCM/wc5Oed4RbCG2gAAAABJRU5ErkJggg==\n"
}
],
"prompt_number": 8
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Collaborative filtering\n",
"\n",
"In its most basic form:\n",
"\n",
"- Take all pairs of movies that have a users in common that rated those movies\n",
"- For each pair that have users in common:\n",
" - build up a vector of ratings for each movie containing the ratings from users that are in common\n",
" - compute the similarity of those vectors according to some measure of similarity\n",
"- For each movie, find all other movies that have users in common\n",
"- The take the top-N similar movies and use those as recommendations\n",
"\n",
"## Cosine similarity\n",
"\n",
"$$\\text{similarity} = \\cos(\\theta) = {A \\cdot B \\over \\|A\\| \\|B\\|} = \\frac{ \\sum\\limits_{i=1}^{n}{A_i \\times B_i} }{ \\sqrt{\\sum\\limits_{i=1}^{n}{(A_i)^2}} \\times \\sqrt{\\sum\\limits_{i=1}^{n}{(B_i)^2}} }$$\n",
"\n",
"## Our approach\n",
"- Group ratings by user\n",
"- For each user, take only N recent most ratings\n",
"- Normalize ratings for each user (divide by max rating per user)\n",
"- For each user, generate pairs of movies from all permutations that exist for a user together with a pair of ratings\n",
"- Group by movie pair and collect rating vectors for each movie pair (in a distributed way)\n",
"- Collect similar movies for each movie\n",
"- Keep only top N\n",
"- (Also, filter movie pairs that have less than 5 users in common)\n"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"from itertools import combinations\n",
"from math import sqrt\n",
"from functools import partial"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 9
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# Sample X% of users\n",
"sample_size = 1.0 # == fraction of number of users, not ratings\n",
"\n",
"ratings_by_user = ratings.\\\n",
" keyBy(lambda (user, movie, rating, ts): user)\\\n",
" .groupByKey()\n",
"\n",
"# Keep only N recent most ratings for each user\n",
"def filter_old( (user, ratings), num = 10000 ):\n",
" recent_most = sorted(ratings, key = lambda (user, movie, rating, ts): ts, reverse = True)[:num]\n",
" return (user, recent_most)\n",
"\n",
"recent_by_user = ratings_by_user\\\n",
" .map(partial(filter_old, num=200))\n",
"\n",
"if sample_size < 1.0:\n",
" final_ratings = recent_by_user.sample(False, sample_size, 137)\n",
"else:\n",
" final_ratings = recent_by_user\n",
"\n",
"final_ratings.count()"
],
"language": "python",
"metadata": {},
"outputs": [
{
"metadata": {},
"output_type": "pyout",
"prompt_number": 10,
"text": [
"69878"
]
}
],
"prompt_number": 10
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"def norm_ratings_per_user_and_remove_ts( (user, ratings) ):\n",
" mx = max([r for u, m, r, ts in ratings])\n",
" return (user, [ (movie, rating / mx) for dummy, movie, rating, ts in ratings ])\n",
"\n",
"normed_ratings_by_user = final_ratings.map(norm_ratings_per_user_and_remove_ts)\n",
"\n",
"normed_ratings_by_user.take(1)"
],
"language": "python",
"metadata": {},
"outputs": [
{
"metadata": {},
"output_type": "pyout",
"prompt_number": 11,
"text": [
"[(38912,\n",
" [(1573, 0.6),\n",
" (788, 0.6),\n",
" (1500, 0.6),\n",
" (991, 0.8),\n",
" (805, 1.0),\n",
" (1527, 0.6),\n",
" (745, 1.0),\n",
" (802, 0.6),\n",
" (296, 0.8),\n",
" (62, 1.0),\n",
" (832, 0.8),\n",
" (318, 1.0),\n",
" (1049, 0.6),\n",
" (150, 0.6),\n",
" (266, 0.6),\n",
" (381, 0.6),\n",
" (361, 0.6),\n",
" (378, 0.6),\n",
" (144, 0.8),\n",
" (151, 0.6),\n",
" (155, 0.6),\n",
" (161, 0.6),\n",
" (597, 0.8),\n",
" (440, 0.6),\n",
" (1231, 0.6),\n",
" (11, 0.8),\n",
" (480, 0.6),\n",
" (1271, 0.6),\n",
" (587, 0.8),\n",
" (246, 0.8),\n",
" (953, 0.6),\n",
" (1103, 0.8),\n",
" (1042, 1.0),\n",
" (1127, 0.6),\n",
" (31, 0.8),\n",
" (377, 0.8),\n",
" (919, 0.6),\n",
" (1223, 1.0),\n",
" (353, 0.6),\n",
" (539, 0.8),\n",
" (1225, 0.8),\n",
" (1307, 1.0),\n",
" (1090, 1.0),\n",
" (1207, 0.6),\n",
" (1259, 0.8),\n",
" (1097, 1.0),\n",
" (1270, 1.0),\n",
" (344, 0.6),\n",
" (1136, 0.6),\n",
" (260, 0.8),\n",
" (457, 1.0),\n",
" (1196, 1.0),\n",
" (1240, 0.8),\n",
" (1198, 1.0),\n",
" (1210, 0.8),\n",
" (257, 0.6),\n",
" (339, 0.8),\n",
" (1101, 0.8),\n",
" (23, 0.6),\n",
" (165, 0.6),\n",
" (276, 0.6),\n",
" (315, 0.6),\n",
" (340, 0.8),\n",
" (1036, 1.0),\n",
" (110, 1.0),\n",
" (474, 0.8),\n",
" (368, 0.8),\n",
" (551, 0.6),\n",
" (1027, 0.6),\n",
" (1148, 0.8),\n",
" (225, 0.6),\n",
" (357, 0.8),\n",
" (223, 0.8),\n",
" (173, 0.6),\n",
" (198, 0.6),\n",
" (356, 1.0),\n",
" (442, 0.6),\n",
" (475, 1.0),\n",
" (1246, 1.0),\n",
" (208, 0.6),\n",
" (367, 1.0),\n",
" (434, 0.6),\n",
" (450, 0.8),\n",
" (589, 1.0),\n",
" (1197, 1.0),\n",
" (50, 0.8),\n",
" (224, 0.6),\n",
" (236, 0.8),\n",
" (292, 0.6),\n",
" (337, 0.8),\n",
" (500, 0.8),\n",
" (508, 0.6),\n",
" (595, 1.0),\n",
" (47, 1.0),\n",
" (282, 0.8),\n",
" (527, 1.0),\n",
" (588, 1.0),\n",
" (380, 0.6),\n",
" (590, 1.0),\n",
" (153, 0.6),\n",
" (592, 0.6),\n",
" (593, 1.0),\n",
" (24, 0.8),\n",
" (1265, 1.0),\n",
" (376, 0.6),\n",
" (786, 0.6),\n",
" (36, 1.0),\n",
" (1391, 0.8),\n",
" (748, 0.8),\n",
" (650, 0.6),\n",
" (839, 0.6),\n",
" (100, 0.6),\n",
" (112, 0.6),\n",
" (694, 0.6),\n",
" (852, 0.6),\n",
" (1061, 1.0),\n",
" (94, 0.6),\n",
" (74, 0.8),\n",
" (640, 0.6),\n",
" (719, 0.6),\n",
" (1356, 0.6),\n",
" (9, 0.6),\n",
" (628, 0.8),\n",
" (661, 0.8),\n",
" (762, 0.6),\n",
" (104, 0.6),\n",
" (653, 0.6),\n",
" (1073, 0.6),\n",
" (7, 0.6),\n",
" (494, 0.6),\n",
" (608, 0.2),\n",
" (733, 1.0),\n",
" (6, 1.0),\n",
" (1, 1.0),\n",
" (32, 0.8),\n",
" (95, 0.6),\n",
" (141, 0.8),\n",
" (648, 0.4),\n",
" (736, 0.6),\n",
" (780, 0.6)])]"
]
}
],
"prompt_number": 11
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"def coo_for_user((user, ratings)):\n",
" # sort on movie ID, such that for each user tho coo pair looks the same (i.e. you always get A,B and never B,A)\n",
" movie_ratings = sorted([ (movie, rating) for movie, rating in ratings], key = lambda (m,r): m)\n",
" combis = combinations(movie_ratings, 2)\n",
" \n",
" def gen(c):\n",
" while True:\n",
" (left_movie, left_rating), (right_movie, right_rating) = c.next()\n",
" yield ((left_movie, right_movie), (left_rating, right_rating)) \n",
" \n",
" return gen(combis)\n",
"\n",
"def to_similarity_struct( ((left_movie, right_movie), (left_rating, right_rating)) ):\n",
" left_square, right_square = left_rating ** 2, right_rating ** 2\n",
" rating_product = left_rating * right_rating\n",
" return (\n",
" (left_movie, right_movie),\n",
" (rating_product, left_square, right_square, 1)\n",
" )\n",
"\n",
"coo = normed_ratings_by_user.flatMap(coo_for_user).map(to_similarity_struct)\n",
"\n",
"coo.cache()\n",
"coo.count()"
],
"language": "python",
"metadata": {},
"outputs": [
{
"metadata": {},
"output_type": "pyout",
"prompt_number": 12,
"text": [
"459139368"
]
}
],
"prompt_number": 12
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"coo.take(5)"
],
"language": "python",
"metadata": {},
"outputs": [
{
"metadata": {},
"output_type": "pyout",
"prompt_number": 13,
"text": [
"[((1, 6), (1.0, 1.0, 1.0, 1)),\n",
" ((1, 7), (0.6, 1.0, 0.36, 1)),\n",
" ((1, 9), (0.6, 1.0, 0.36, 1)),\n",
" ((1, 11), (0.8, 1.0, 0.6400000000000001, 1)),\n",
" ((1, 23), (0.6, 1.0, 0.36, 1))]"
]
}
],
"prompt_number": 13
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"def similarity_reducer(\n",
" (x_rating_sum_products, x_left_sum_squares, x_right_sum_squares, x_count),\n",
" (y_rating_sum_products, y_left_sum_squares, y_right_sum_squares, y_count)\n",
" ):\n",
" return (\n",
" x_rating_sum_products + y_rating_sum_products,\n",
" x_left_sum_squares + y_left_sum_squares,\n",
" x_right_sum_squares + y_right_sum_squares,\n",
" x_count + y_count\n",
" )\n",
"\n",
"def make_cosine_similarity( (pair, (sum_products, left_sum_squares, right_sum_squares, count)) ):\n",
" cosine = sum_products / (sqrt(left_sum_squares) * sqrt(right_sum_squares))\n",
" return ( pair, (cosine, count) )\n",
"\n",
"# NOTE: python tuple hashing is quirky, so we force a odd number of partitions\n",
"similarity_structs = coo.reduceByKey(similarity_reducer, numPartitions = 73)\n",
"similarities = similarity_structs\\\n",
" .filter( lambda ( pair, (sp, lsq, rsq, cnt) ): cnt >= 5 )\\\n",
" .map(make_cosine_similarity)\n",
"\n",
"similarities.cache()\n",
"similarities.count()"
],
"language": "python",
"metadata": {},
"outputs": [
{
"metadata": {},
"output_type": "pyout",
"prompt_number": 14,
"text": [
"7877787"
]
}
],
"prompt_number": 14
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"similarities.take(5)"
],
"language": "python",
"metadata": {},
"outputs": [
{
"metadata": {},
"output_type": "pyout",
"prompt_number": 15,
"text": [
"[((4022, 5657), (0.9105766291591555, 5)),\n",
" ((2133, 3259), (0.9568295400283606, 73)),\n",
" ((588, 1914), (0.9366517100804483, 192)),\n",
" ((2245, 3972), (0.9509624117603759, 29)),\n",
" ((150, 805), (0.967924749191815, 1612))]"
]
}
],
"prompt_number": 15
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"def expand_tuples( (pair, (sim, count)) ):\n",
" return [\n",
" (pair[0], (pair[1], sim, count)),\n",
" (pair[1], (pair[0], sim, count))\n",
" ]\n",
"\n",
"def sort_recos( (item, recos) ):\n",
" return (item,\n",
" sorted(recos, key = lambda r: r[1], reverse = True)[:10]\n",
" )\n",
"\n",
"recos = similarities\\\n",
" .flatMap(expand_tuples)\\\n",
" .groupByKey()\\\n",
" .map(sort_recos)\\\n",
" .flatMap( lambda (key, values): [ (key,) + value for value in values ] )\n",
"\n",
"recos.take(10)"
],
"language": "python",
"metadata": {},
"outputs": [
{
"metadata": {},
"output_type": "pyout",
"prompt_number": 16,
"text": [
"[(4864, 913, 0.9996193910743492, 5),\n",
" (4864, 3969, 0.9961596536339333, 5),\n",
" (4864, 1282, 0.993376753949167, 6),\n",
" (4864, 3996, 0.9923825425404044, 6),\n",
" (4864, 4148, 0.9922143508916336, 5),\n",
" (4864, 7143, 0.9916885491976591, 7),\n",
" (4864, 3489, 0.9914986965863366, 6),\n",
" (4864, 7361, 0.9888526548481258, 5),\n",
" (4864, 468, 0.9882027963473393, 5),\n",
" (4864, 8368, 0.9878376651495789, 5)]"
]
}
],
"prompt_number": 16
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"recos.saveAsTextFile('/item-item-similarity')"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 17
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Let's also create a graph of item-item pairs that have some level of similarity\n",
"\n",
"- Use items as vertices\n",
"- Create edges between similar items\n",
"- Filter similarities:\n",
" - Calculate basic statistics for all similarities\n",
" - Filter out similarities less than (mean - 2 * sigma)\n",
"- Save item-item pairs as CSV"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"stats = similarities.map( lambda ((l,r), (sim,coo)): sim).stats()\n",
"stats"
],
"language": "python",
"metadata": {},
"outputs": [
{
"metadata": {},
"output_type": "pyout",
"prompt_number": 18,
"text": [
"(count: 7877787, mean: 0.936288608874, stdev: 0.0436237480853)"
]
}
],
"prompt_number": 18
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"similarities.map( lambda ((l,r), (sim,coo)): sim).reduce(min)"
],
"language": "python",
"metadata": {},
"outputs": [
{
"metadata": {},
"output_type": "pyout",
"prompt_number": 19,
"text": [
"0.2710580855988268"
]
}
],
"prompt_number": 19
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"similarities.map( lambda ((l,r), (sim,coo)): sim).reduce(max)"
],
"language": "python",
"metadata": {},
"outputs": [
{
"metadata": {},
"output_type": "pyout",
"prompt_number": 20,
"text": [
"1.0000000000000002"
]
}
],
"prompt_number": 20
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"similarities_filtered = similarities.filter( lambda ((l,r), (sim,coo)): sim >= stats.mean() - 2 * stats.stdev() )\n",
"similarities_filtered.count()"
],
"language": "python",
"metadata": {},
"outputs": [
{
"metadata": {},
"output_type": "pyout",
"prompt_number": 21,
"text": [
"7527399"
]
}
],
"prompt_number": 21
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"edges = similarities_filtered.map( lambda ((l, r), (sim, coo)): '%d,%d' % (l, r) )\n",
"edges.saveAsTextFile('/item-graph-edges')"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 22
}
],
"metadata": {}
}
]
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment