Skip to content

Instantly share code, notes, and snippets.

@mlopatka
Last active September 25, 2016 13:06
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save mlopatka/7089ec209c6941ca860f6670b84d2a60 to your computer and use it in GitHub Desktop.
Save mlopatka/7089ec209c6941ca860f6670b84d2a60 to your computer and use it in GitHub Desktop.
Prototype notebook for add-on recommendation targeting new Firefox users with no add-on history
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 10,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Populating the interactive namespace from numpy and matplotlib\n"
]
}
],
"source": [
"import numpy as np\n",
"\n",
"from pyspark.sql import SQLContext, Row\n",
"from moztelemetry import get_pings, get_pings_properties, get_one_ping_per_client, get_clients_history, get_records\n",
"\n",
"#import plotly.plotly as py\n",
"#import plotly.graph_objs as go\n",
"\n",
"import numpy as np\n",
"import matplotlib.mlab as mlab\n",
"import matplotlib.pyplot as plt\n",
"\n",
"import urllib2, json\n",
"\n",
"%pylab inline"
]
},
{
"cell_type": "code",
"execution_count": 76,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Seed passed to both the client sample and the random pair split, so each is seeded identically on every run.\n",
"RAND_SEED = 239507478\n",
"# Add-on ids that filterSystemAddons removes from every client's add-on list before any weighting.\n",
"ADDON_EXCLUSION_LIST = ['loop@mozilla.org', 'firefox@getpocket.com', 'e10srollout@mozilla.org', 'firefox-hotfix@mozilla.org']"
]
},
{
"cell_type": "code",
"execution_count": 97,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"119826\n",
"119826\n",
"CPU times: user 1.65 s, sys: 224 ms, total: 1.87 s\n",
"Wall time: 2min 57s\n"
]
}
],
"source": [
"%%time\n",
"# Add-on view of the longitudinal table: Firefox clients whose active_addons[0] entry is non-empty,\n",
"# reduced to client_id plus that entry (index 0 is presumably the freshest ping -- confirm against the schema).\n",
"freshPings_addon_DF = sqlContext.sql(\"SELECT * FROM longitudinal\")\\\n",
" .where(\"active_addons IS NOT null and size(active_addons[0]) > 0\")\\\n",
" .where(\"size(active_addons) > 1\")\\\n",
" .where(\"build IS NOT null AND build[0].application_name = 'Firefox'\")\\\n",
" .selectExpr(\"client_id as client_id\", \"active_addons[0] as active_addons\")\n",
"\n",
"# .where(\"size(active_addons) > 1\")\\\n",
"# The filter above is a temporary hack to get around a telemetry bug; remove it once that issue is resolved.\n",
"\n",
"# Categorical view built with the same client filter: geography, OS, memory, theme, Flash version and settings flags.\n",
"freshPings_cat_DF = sqlContext.sql(\"SELECT * FROM longitudinal\")\\\n",
" .where(\"active_addons IS NOT null and size(active_addons[0]) > 0\")\\\n",
" .where(\"size(active_addons) > 1\")\\\n",
" .where(\"build IS NOT null AND build[0].application_name = 'Firefox'\")\\\n",
" .selectExpr(\"client_id as client_id\", \"geo_country[0] as geo_country\",\\\n",
" \"os as os\", \"system[0].memory_mb as sys_mem\",\\\n",
" \"system[0].virtual_max_mb as virt_mem\",\\\n",
" \"theme[0].id as theme\", \"flash_version[0] as flash\",\\\n",
" \"settings[0].blocklist_enabled as block_list_bool\",\\\n",
" \"settings[0].e10s_enabled as e10_bool\",\\\n",
" \"settings[0].telemetry_enabled as telemetry_bool\",\\\n",
" \"settings[0].default_search_engine_data.name as default_search\",\\\n",
" \"settings[0].locale as loc\",\\\n",
" \"settings[0].update.channel as channel\"\\\n",
" )\n",
"\n",
"# Thanks to Alessio, for being a BOSS\n",
"\n",
"# Disabled query for continuous (timing) features; it is wrapped in a bare string literal so it never executes.\n",
"\"\"\"\n",
"freshPings_con_DF = sqlContext.sql(\"SELECT * FROM longitudinal\")\\\n",
" .where(\"active_addons IS NOT null\")\\\n",
" .where(\"build IS NOT null AND build[0].application_name = 'Firefox'\")\\\n",
" .selectExpr(\"client_id as client_id\", \"session_length[0] as session_length\",\\\n",
" \"paint_build_displaylist_time[0] as paint_time\", \"predictor_wait_time[0] as predictor_wait_time\",\\\n",
" \"html_background_reflow_ms_2[0] as reflow_ms\", \"telemetry_memory_reporter_ms[0] as tele_rep_time\",\\\n",
" \"image_decode_latency_us[0] as im_decode_latency\", \"http_subitem_open_latency_time[0] as op_latent\"\\\n",
" )\n",
"\"\"\"\n",
"\n",
"# Draw a seeded 5% sample of clients and reduce each row to (client_id, [addon ids]).\n",
"subset_addon_rdd = freshPings_addon_DF.sample(False, 0.05, RAND_SEED).rdd.cache()\n",
"subset_addon_tok_rdd = subset_addon_rdd.map(lambda p: (p['client_id'], p['active_addons'].keys())).cache()\n",
"\n",
"# to filter by client_id\n",
"def isinSet(p):\n",
"    \"\"\"Return True when client id `p` is one of the sampled clients that kept at least one add-on.\n",
"\n",
"    Membership is tested against the broadcast copy of the id set, so workers fetch the\n",
"    set once instead of receiving it inside every task closure.\n",
"    \"\"\"\n",
"    return p in valid_ID_in_addon_Set_Broadcast.value\n",
"\n",
"def filterSystemAddons(p, ADDON_EXCLUSION_LIST):\n",
"    \"\"\"Return the add-on ids in `p` that are not listed in ADDON_EXCLUSION_LIST, order preserved.\"\"\"\n",
"    return [x for x in p if x not in ADDON_EXCLUSION_LIST]\n",
"\n",
"# Strip the excluded add-ons, then drop clients left with an empty add-on list.\n",
"subset_addon_tok_filtered_rdd = subset_addon_tok_rdd.map(lambda p: (p[0], filterSystemAddons(p[1], ADDON_EXCLUSION_LIST))).filter(lambda p: len(p[1])>0).cache()\n",
"\n",
"valid_ID_in_addon_Set = set(subset_addon_tok_filtered_rdd.map(lambda p: p[0]).collect())#set of valid target ids that were randomly chosen for addons\n",
"# Keep the handle: sc.broadcast() only helps if tasks read the data through <handle>.value.\n",
"valid_ID_in_addon_Set_Broadcast = sc.broadcast(valid_ID_in_addon_Set)\n",
"\n",
"# Restrict the categorical rows to the clients that survived the add-on sampling and filtering.\n",
"subset_cat_rdd = freshPings_cat_DF.rdd.filter(lambda p: isinSet(p[0])).cache()\n",
" \n",
"#print subset_cat_rdd.count()\n",
"#print subset_addon_tok_filtered_rdd.count()\n",
"#print subset_addon_rdd.take(1)\n",
"#print subset_cat_rdd.take(1)\n",
"#print freshPings_addon_DF.count()\n",
"#print freshPings_cat_DF.count()\n",
"#print freshPings_addon_DF.count()\n",
"#print freshPings_addon_DF.groupBy(\"client_id\").count().count()\n",
"#print subset_addon_tok_filtered_rdd.countByKey()\n",
"#print subset_cat_rdd.countByKey()\n",
"#print sum(i == 2 for i in num_unique_add.itervalues())\n",
"#print sum(i == 2 for i in num_unique_cat.itervalues())"
]
},
{
"cell_type": "code",
"execution_count": 114,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"119826 records parsed including 311484 installed addons\n",
"CPU times: user 344 ms, sys: 84 ms, total: 428 ms\n",
"Wall time: 2.56 s\n"
]
}
],
"source": [
"%%time\n",
"# implement some standard nlp functions to work with addons\n",
"\n",
"def addon_frequency(add_on_list):\n",
"    \"\"\"Map every add-on id in `add_on_list` to the uniform weight 1/len(add_on_list).\n",
"\n",
"    We can cheat a bit here because each add-on appears once per client, so its token\n",
"    frequency is always one over the number of elements. An empty list gives an empty dict.\n",
"    \"\"\"\n",
"    list_size = len(add_on_list)\n",
"    # dictionary of add-on id (key) -> frequency (value)\n",
"    return {addon_id: float(1) / list_size for addon_id in add_on_list}\n",
"\n",
"def population_frequency(corpusRDD):\n",
"    \"\"\"Compute an un-logged inverse document frequency for every add-on in the corpus.\n",
"\n",
"    corpusRDD: RDD of (client_id, [addon ids]) pairs.\n",
"    Returns an RDD of (addon_id, total_records / records_containing_addon), so widely\n",
"    installed add-ons score close to 1 and rare add-ons score high.\n",
"    \"\"\"\n",
"    # Imported explicitly: `add` was otherwise only reachable through the names that\n",
"    # `%pylab inline` star-imports into the notebook namespace.\n",
"    from operator import add\n",
"    total_pings = corpusRDD.count()\n",
"    # in principle list(set(...)) is unnecessary, but it guarantees one count per client per add-on\n",
"    addons_intf = corpusRDD.flatMap(lambda p: list(set(p[1])))\n",
"    # pair every occurrence with 1, then sum the 1s per add-on to get its document count\n",
"    addonSumPairRDD = addons_intf.map(lambda p: (p, 1)).reduceByKey(add)\n",
"    # corpus size divided by the add-on's document count\n",
"    population_frequency_addons = addonSumPairRDD.map(lambda p: (p[0], float(total_pings) / p[1]))\n",
"    return population_frequency_addons\n",
"\n",
"def addonRarity(addons_freq):\n",
"    \"\"\"Weight each add-on in `addons_freq` by its corpus rarity (tf * idf).\n",
"\n",
"    addons_freq: iterable of add-on ids; a dict keyed by add-on id works, only the keys are read.\n",
"    Returns {addon_id: (1 / len(addons_freq)) * idf}. An add-on missing from the corpus\n",
"    table is given idf 1. Empty input returns an empty dict instead of dividing by zero.\n",
"\n",
"    Reads the module-level corpus_addons_IDFs_Broadcast, accepted either as a plain\n",
"    {addon_id: idf} dict or as a Spark Broadcast handle wrapping such a dict.\n",
"    \"\"\"\n",
"    if not addons_freq:\n",
"        return dict()\n",
"    # unwrap a Broadcast handle when one is supplied; a plain dict passes through unchanged\n",
"    idf_lookup = getattr(corpus_addons_IDFs_Broadcast, 'value', corpus_addons_IDFs_Broadcast)\n",
"    # if duplicates were possible, the tf would be count(i) / len(addons_freq), computed inside the loop\n",
"    t = float(1) / len(addons_freq)\n",
"    addon_dict_intf = dict()\n",
"    for i in addons_freq:\n",
"        addon_dict_intf[i] = t * idf_lookup.get(i, 1)\n",
"    return addon_dict_intf\n",
"\n",
"def convertKeyCombinedIDs(p):\n",
"    \"\"\"Flatten ((id_1, id_2), score) into ('id_1 id_2', score).\n",
"\n",
"    The two ids are stringified and joined with a single space so that each compared\n",
"    pair can be addressed by one string key.\n",
"    \"\"\"\n",
"    id_pair, similarity_score = p[0], p[1]\n",
"    combined_key = ' '.join([str(id_pair[0]), str(id_pair[1])])\n",
"    return (combined_key, similarity_score)\n",
" \n",
"def count_entities(p):\n",
"    \"\"\"Return the total number of add-ons held across all records of the pair RDD `p`.\n",
"\n",
"    p: RDD of (client_id, [addon ids]) pairs. RDD.sum() is used because it returns 0 for\n",
"    an empty RDD (reduce() raises there) and needs no `add` function in scope.\n",
"    \"\"\"\n",
"    return p.map(lambda q: len(q[1])).sum()\n",
"\n",
"def exclude_empty_entities(p):\n",
"    \"\"\"Filter predicate: True when the value half of the (key, value) pair `p` is non-empty.\"\"\"\n",
"    return True if p[1] else False\n",
"\n",
"def addonID2addonName(a):\n",
" \"\"\"Return the entry stored for add-on id `a` in the module-level addonIdsToNames_broadcast.\n",
"\n",
" NOTE(review): the table is subscripted directly, so it is presumably a plain mapping rather\n",
" than a Spark Broadcast handle -- confirm where it is defined.\n",
" \"\"\"\n",
" return addonIdsToNames_broadcast[a]\n",
"\n",
"# Sanity check on the filtered sample: number of client records and total add-ons across them.\n",
"recCNT = subset_addon_tok_filtered_rdd.count()\n",
"addonCNT = count_entities(subset_addon_tok_filtered_rdd)\n",
"\n",
"print str(recCNT) + ' records parsed including ' + str(addonCNT) +' installed addons' "
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### We borrow several concepts from NLP here, but we have it easy because our lexicon is just the set of all possible add-ons. By calculating the IDF (inverse document frequency) and TF (token frequency), we can convert each user's add-on list into a vector of numerical weights that favours rare add-ons (in pairwise comparisons between users)."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### This will take a long time! We use the superset that our small sample was drawn from, to ensure that the set exclusion is empty (i.e. no add-on in the sample is missing from the corpus). We then calculate the frequency of add-ons globally within the freshest part of the longitudinal corpus, once again excluding pings with no add-ons for now."
]
},
{
"cell_type": "code",
"execution_count": 112,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 444 ms, sys: 104 ms, total: 548 ms\n",
"Wall time: 2min 24s\n"
]
}
],
"source": [
"%%time\n",
"# Tokenise the full corpus (the superset the sample was drawn from) as (client_id, [addon ids]).\n",
"# .rdd is requested explicitly, as in the sampling cell: PySpark's DataFrame.map no longer exists in Spark 2.x.\n",
"corpusTokenizedRDD = freshPings_addon_DF.rdd.map(lambda p: (p['client_id'], p['active_addons'].keys()))\n",
"corpusIDFs = population_frequency(corpusTokenizedRDD)\n",
"## DISCUSSION POINT: should we include the subsample in the corpus?\n",
"# option 1: no, but then filter out of the sample any add-ons we have not seen in the corpus\n",
"# option 2: yes, leave the corpus as the superset and risk overfitting\n",
"corpus_addons_IDFs_Broadcast = corpusIDFs.collectAsMap()\n",
"# NOTE: the Broadcast handle returned below is discarded and addonRarity reads the plain dict above,\n",
"# so as written the dict still travels inside each task closure; keep the handle and read .value to benefit.\n",
"sc.broadcast(corpus_addons_IDFs_Broadcast) # this is really important, otherwise we are sending this to each worker FOR EACH partition!!!"
]
},
{
"cell_type": "code",
"execution_count": 103,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 348 ms, sys: 0 ns, total: 348 ms\n",
"Wall time: 1.37 s\n"
]
},
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAbYAAAEZCAYAAAD7QwBLAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAIABJREFUeJzt3Xm4HFWd//H3J4EQtgQQSSRAREQIuCAjyIjKdYEISII6\nIriwOZgRIyqjsrgQGHQGxp8gIiiLGEAMkUWDIgSEi6JsCmExIYQBkhBIQBbDJpDw/f1xTieVppe6\nN7fv0vfzep5+bi3nnDp1um9/+1SdqlJEYGZm1i6G9HUFzMzMepIDm5mZtRUHNjMzaysObGZm1lYc\n2MzMrK04sJmZWVtxYBuEJB0n6YIeKutdku6TtFTShJ4ocyCRNFzSFZKelnRxi7bRY+9XT5N0j6T3\n9tG2G7aLpAclvb8362T9gwPbANGCf9KeuoDxBOC0iBgRETN6qMyB5N+A1wIbRsQnVrcwSbtJWlhj\nVZ9fcCrpPEknFJdFxJsj4g99VSf6QbtY/+PAZqtrLDC73kpJ6sW69IWxwH3RjTsdSBpaazFt8mVd\nZ/+sCUn+Xl5NbsABQNL5wBbAFfmQ31fz8gn5UNCTkq6TtG0hz+skXSLpMUn/J+mLdcpeQ9IvJP0y\nTx8n6WJJU/O27pa0Y5289wNbAr/JadeUdL2kEyXdKOk5YEtJIySdK+kRSQsl/Vcl4EkaIul7kh6X\ndL+kwyW9Uvnnru6pVh9+krSLpD9JekrSHZJ2K6y7XtIJuS5LJV0laaPC+ncX8s6XdKCkd0haXAzI\nkj4qaVaN/Z8CfBvYP5d/iJJvSnool/MzSSNy+rF53w6VNB/4fVV56wBXAptKeiaXOTqvXqvee1L2\nvc5ph+X2ni/pUUlnSForr9stvz9HSloiaZGkg/O6w4BPAV/Pdfh19fuT35tfSrpA0tPAQbk9js7v\n7eOSpknaoE7dNlA6rPuYpCfy9KaF9a+X1CnpH5KuBjauyv+Z3O6PSzq2xn6fmvfpYUmnSFqz2X7n\n9XtJ+lve74WSjmzQvodJmp3T3iNph7x82/x5fCq/f/sU8pyX34ffSnoG6MjLzpQ0M5d1vaQtcvrK\n52hIoYzrJR2ap7fK7fR0bstf1Ktv24oIvwbAC3gQeF9h/k3As8D7gaHA14B5wBqkX/1/Ab6R170e\nuB/YPec9DjgfGA78BjgXUGHd88D4XM53gZu6UK/rgYeAbUk/nNYALgfOyNvbGLgZOCyn/w9Sj29T\nYAPgOmA5MKRQ/vsL5R8HnJ+nxwB/B8bn+Q/k+dcU6jIP2ApYK89/N68bCywF9stttCHw1rzunkqZ\nef4y4Mt19n9FffL8ocB9ufx1gEsL9R0LvAL8DFgbWKtGebsBC2pso+Z70uy9rlH+KcCvgJHAusCv\nge8Utv1y3t5QYE/gOWBkXn8ecEKN9//9hXq+COyT59cCvgT8GXgdsCZwJnBRnbptBHwk51sXuBi4\nvLD+z8D/5nLek9+/SttuBzwD7JrX/z/gpULdTsj5X5NffwKOL7nfjwDvytMjgR3q1P/jwEJgxzz/\nBmBz0v/APOCoPP2+XPetC+36FLBLod3OA/5R2J9TgT8WPkcr/kcKn/VD8/RFwDF5elil7oPp1ecV\n8KvkG/XqL/hvAtMK88r/VO8FdgYeqsp/NHBunj4uf6F1AqdUpTsOmFmYHwc814V6XQ9MKcxvAvyT\nwpc4sD/w+zz9e+BzhXW7Uz6wfR2YWlWfq4DPFOpybGHd54ErC+1xaZ19+jpwYZ7eKH/JjaqTtjqw\nXQv8R2H+TaQv2CGFL6SxDdqzXmCr+Z4A72z0Xtco/1lgy8L8vwIPFLb9XNUX5hJg5zxdJrB1Vq2f\nzao/fF5XaY8Sn/kdgCfy9BY539qF9T8vfBa+RSFgkn5UvFio2/2s+mNljy7s90PAYcD6Tep7FfDF\nGsvfDTxStewi4NuFdv1Z1frzqvZnXWAZ6cdc
s8A2FfgxMKZZG7fraw1soNoUmF+ZiYiQ9DDpg78M\nGCPpybxapC/W4kn+XUi/HvevUfbiwvTzwHBJQyLilZJ1Kw5+GEv6xfmo8tHH/FpQ2I9i+vmUNxbY\nr3BYR6R9Kh7iq96X9fL05sD/1Sn3QmC2pLVJPbo/RMSSknVa5X3J02sAowrLHi5ZVlHN94T0hd/s\nvU4rpNeSvvD/qpVHWofkPBVPVL3PxTYro3rgy1jgckmVMkXqHY0CHq2q39qknsl4Uu9dwHpKlX0d\n8FREvFDIMh/YLE+v8jmKiOclPVFIuykrP3OVvJsW5hvt98dIgfMkSXeSekM319j3ep+p6s94Zftj\nCvO1BgwV9+e5/B5vCjxWI23R14ATgVtznu9HxHlN8rQVB7aBI6rmHwHeXLVsc2AR6ZftAxGxTYPy\nrgbuAq6T1BERzf5ZuqJY14WkHttrIv+crPIoqd4VY6vWP0f6Mq4YXZheSPrFPqkbdVxI6tm+SkQ8\nIukm0hfap0mHUct6hFX3YSzpi3wJK/ezVjtQYl0tC2n+Xlf8nfSFvX1EPNoscQ1l6ladZgGpJ3FT\nibz/CWwN7BQRj0t6G3A7KcA9Cmwoae1CcNuCdGiXvL54jnkd0iHHisr7MifPj83Lmu9QxF+BfZUG\nw3wRmJ63XW0h6bB3tUdY9TNeqfvc4mZq5FuRR9J6pKMHlf9vSP8Xz+bpFf8X+X/5cznfrsC1km6I\niAdq7V878uCRgWMx6Zh9xXRgb0nvUxr08VVSAPkzcCvwjKSvK11nNVTS9pLeUSwwIr5HOiTye0nF\nL4Fq3R7ZGBGLgZnAKZLWz4MJ3qCV1z5NB46QNEbShqTzEEWzSIMz1sj1/7fCuguBfSTtoTQIZXge\nCLApzf0c+ICkf8vts1H+Iq24gHRI8s2kc2xl/QL4Sh7osB7wHdIh42KPpZElwGuUB5w0UCmn1HsN\nqVcPnA2cmntv5Hbfo+S+LWHVz2AZPwG+Wxj48FrVv95xfeAFYKnSIJ8phbovIJ1LPF5pkNK7gX0K\neS8BPqx0XeWapHNqxbb+BfBNSRtL2pjUA2t6bWDe1icljYiI5aTzeMvrJD8H+KrywJ48iGNz4Bbg\n+fwerSGpA/hwrlMje+X9GQb8F+m86iMR8XdSgPt0/twfSiGg5s90pTf4NCn4lz3a0hYc2AaO/wG+\npTQC8siIuI/UmzgdeBzYm3TSfln+Ev0w6RzFg6RDF2cDr/qyjIgTSYMJrlGd0Wp0rYdRK+2BpJPY\ns4EngV+y8hfm2aTe452kL65Lq/J+C3hjznccKSBV6v4wMBE4ltQG84GvsvJzXbfeEbEQ2CunfxK4\nA3hrIcnlpF/1l0XEP+uVU8NPSV+YfyAdlnoeOKK46UaZI2Iu6Qvvgfxej66XNKcv/V5nR5HON92s\nNHJxJuk8YN0qFabPBbbP9bqsxvpafkA6nztT0j9IP7xq9pRJhyHXIfUs/0waIVr0SdIh9CdIn4up\nKyoZMRv4AqntHslpiod8TyR9vu5i5WftOw3qXdyvzwAP5vb6XK7HqzNEXJLLvEjSUtJnaKOIeJkU\nhPfK+3Y66TzwvBrbKrqIFNyfAN5O+n+vOIz0w+vvpHOufyqs2wm4JdfhV8AREfFQg31tO5WRcK3b\ngPQh0gd2COmE9kk10pzGypFIB0fErEZ58y/7i0lfPA8B+0XEP/K6Y0gj05YBX4qImfnY/S9Jv2qW\nAb+JiGNy+oNII60q/wSnR8RPe7odrBxJY4EHgDW7cE6vVXW5nzSw5bq+rIcNPpLOAxZGxLf7ui4D\nUUt7bPnk9umkk8HbAweocK1VTrMnsFVEbA1MIo3maZb3aODafF7hOqASpLYjnewfRwqUZ2jlWfL/\njYhxpF8+u0oaX6jGtIjYMb8c1Ppen1/ULeljwCsOamYDT6sPRe4MzIuI+bk7Po106KhoIumaKiLi\nFmCkpFFN
8k5k5WGIqcC+eXoCKUgty13veaQhuy9ExA15G8tIJ6Qro6mgH3yR2ipaexihCUnXAz8C\nDu/Letig1qf/AwNdq0dFjmHVYawP8+rj67XSjGmSd1Rl+HVELJa0SaGs4uirRaw6pJZ8Hmkf0iHO\nio/mwQxzgSPzuRvrAxExn3SRbF/W4X19uX2ziDi0r+swkPXHwSPd6T2V+nWTh+teBJxaOJk6A3h9\nRLyNdHHt1DrZzcxsAGh1j20Rq17vsVleVp1m8xpphjXIu1jSqIhYkkeNVa7BqldWxVnA3Ij4YWVB\nRDxVWH8OcHKtHZHkQwNmZt0QEb16uqfVPbbbgDcq3bRzGOkuF9WPNplBGg6OpF2Ap/NhxkZ5ZwAH\n5+mDSMOJK8v3V7rh6ZakYeK35rJPBEZExFeKG68aTj2RBneq7+vbxPSX13HHHdfndegvL7eF28Jt\n0fjVF1raY4uI5ZImk66VqQzZnyNpUlodZ0XElUp3z76fNNz/kEZ5c9EnAdPzhYnzSSMhiYjZkqaT\ngtPLwOEREflixWOBOZLuIB26rAzrPyJfMPoy6Xqmg1vZJmZm1lotv6VWRFwFbFO17CdV85PL5s3L\nnwQ+WCfPfwP/XbVsEXV6pxFxLCnomZlZG+iPg0esn+vo6OjrKvQbbouV3BYruS36VsvvPNIuJIXb\nysysayQRbTZ4xMzMrFc5sJmZWVtxYDMzs7biwGZmZm3Fgc3MzNqKA5uZmbUVBzYzM2srDmxmZtZW\nHNjMzKytOLCZmVlbcWAzMxukxu87nnE7jWP8vuP7uio9yoHNzGyQWrBoAaMnj2bBogV9XZUe5cBm\nZmZtxYHNzMzaigObmZm1FQc2MzNrKw5sZmbWVhzYzMysrTiwmZlZW3FgMzOztuLAZmZmbcWBzczM\n2ooDm5mZtRUHNjMzaysObGZm1lYc2MzMrK04sJmZWVtxYDMzs7biwGZmZm3Fgc3MzNqKA5uZmbUV\nBzYzM2srDmxmZtZWHNjMzKyttDywSfqQpHsl3SfpqDppTpM0T9IsSTs0yytpQ0kzJc2VdLWkkYV1\nx+Sy5kjaIy9bW9Jv8rK7JX23kH6YpGk5z02StmhNS5iZWW9oaWCTNAQ4HRgPbA8cIGnbqjR7AltF\nxNbAJODHJfIeDVwbEdsA1wHH5DzbAfsB44A9gTMkKef534gYB7wdeLek8Xn5Z4En8/ZPBU7u2VYw\nM7Pe1Ooe287AvIiYHxEvA9OAiVVpJgLnA0TELcBISaOa5J0ITM3TU4F98/QEYFpELIuIh4B5wM4R\n8UJE3JC3sQy4HdisRlmXAB/okT03M7M+0erANgZYWJh/OC8rk6ZR3lERsQQgIhYDm9Qpa1H19iRt\nAOwDXFudJyKWA09L2qjc7pmZWX+zRl9XoAY1T/IqUapgaShwEXBqRMzv6vanTJmyYrqjo4OOjo7y\nNTQzGwQ6Ozvp7Ozs0zq0OrAtAoqDMTbLy6rTbF4jzbAGeRdLGhURSySNBh5rUlbFWcDciPhhYdnD\nOc8jOfCNiIgna+1MMbCZmdmrVf/oP/7443u9Dq0+FHkb8EZJYyUNA/YHZlSlmQEcCCBpF+DpfJix\nUd4ZwMF5+iDg14Xl++eRjlsCbwRuzWWfSApaX6na/hW5DICPkwajmJnZANXSHltELJc0GZhJCqLn\nRsQcSZPS6jgrIq6UtJek+4HngEMa5c1FnwRMl3QoMJ80EpKImC1pOjAbeBk4PCJC0hjgWGCOpDtI\nhy5Pj4ifAucCF0iaBzxBCqBmZjZAKaLU6alBT1K4rcysnYzbaRyjJ49m8emLmXPbnOYZukESEdGd\nsRPd5juPmJlZW3FgMzOztuLAZmZmbcWBzczM2krTUZH5no1vAzYFXgDuiYjHGucyMzPrG3UDm6St\ngKOAD5Luufg4MBx4k6TngZ8AUyPild6oqJmZWRmNemwnAmcCk6rHuUvaBP
gk8BlW3kDYzMysz9UN\nbBFxQIN1j5Ee8WJmZtavNDoU+dFGGSPisp6vjpmZ2eppdChyn/x3E+BdrLyH4vuAPwMObGZm1u80\nOhR5CICkmcB2EfFonn8d8LNeqZ2ZmVkXlbmObfNKUMuWsOrjZMzMzPqNMnf3/72kq4Ff5PlPsPLp\n02ZmZv1K08AWEZMlfQR4b150VkRc3tpqmZmZdU/Z57HdDjwTEddKWkfS+hHxTCsrZmZm1h1Nz7FJ\nOgy4hHSnEYAxwK9aWSkzM7PuKjN45AvArsBSgIiYR7oEwMzMrN8pE9hejIiXKjOS1gD8KGkzM+uX\nygS2GyQdC6wtaXfgl8AVra2WmZlZ95QJbEeT7ux/NzAJuDIivtHSWpmZmXVTmVGRnwKmRcTZlQWS\nPhwRv2ldtczMzLqnTI/th8AfJY0rLDuhRfUxMzNbLWUC24PAocAlkj6el6l1VTIzM+u+MociIyJu\nl7Qb8AtJ7wSGtrheZmZm3VKmx/YoQET8HRhPGur/5lZWyszMrLuaBraI2Lsw/UpEfC0iygREMzOz\nXtfoCdqnRsSXJV1BjQuyI2JCS2tmZmbWDY3OsV2Q/36vNypiZmbWExo9Qfuv+e8NvVcdMzOz1dPo\nUOTdNLgnZES8tSU1MjMzWw2NDkV+uNdqYWZm1kMaHYqc35sVMTMz6wllHjS6i6TbJD0r6SVJyyUt\n7Y3KmZmZdVWZ69FOBw4A5gFrA/8O/KiVlTIzM+uuUhdaR8T9wNCIWB4R5wEfam21zMzMuqdMYHte\n0jBglqSTJX2lZD4AJH1I0r2S7pN0VJ00p0maJ2mWpB2a5ZW0oaSZkuZKulrSyMK6Y3JZcyTtUVh+\noqQF1YdRJR0k6TFJt+fXoWX3zczM+p8yAeozpJseTwaeAzYHPlamcElDSIcyxwPbAwdI2rYqzZ7A\nVhGxNelBpj8ukfdo4NqI2Aa4Djgm59kO2A8YB+wJnCGp8iSCGcBOdao6LSJ2zK+fltk3MzPrn5re\n3b8wOvIF4Pgulr8zMK9ShqRpwETg3kKaicD5eVu3SBopaRSwZYO8E4Hdcv6pQCcp2E0gBallwEOS\n5uU63BIRt+ZyatXTj+ExM2sTZUZFfljSHZKelLRU0jNdGBU5BlhYmH84LyuTplHeURGxBCAiFgOb\n1ClrUY3t1fJRSXdKmi5psxLpzcysnypzKPJU4CDgNRExIiLWj4gRLaxTd3pPde+QUsIM4PUR8Tbg\nWlIP0MzMBqgyDxpdCNwTEd0JHouALQrzm+Vl1Wk2r5FmWIO8iyWNioglkkYDjzUpq66IeKowew5w\ncr20U6ZMWTHd0dFBR0dHo6LNzAadzs5OOjs7+7QOahavJO0E/BdwA/BiZXlEfL9p4dJQYC7wAdID\nS28FDoiIOYU0ewFfiIi9Je0CnBoRuzTKK+kk4MmIOCmPltwwIo7Og0d+DryTdAjyGmDrYlCW9ExE\nrF+YH50PZyLpI8DXIuJdNfalm7HdzKx/GrfTOEZPHs3i0xcz57Y5zTN0gyQiolfHMZTpsX0HeBYY\nTupFlRYRyyVNBmaSDnuemwPTpLQ6zoqIKyXtJel+0qjLQxrlzUWfBEzPQ/Pnk0ZCEhGzJU0HZgMv\nA4dXolEOhp8E1pa0ADgnIk4AjpA0Iad/Eji4K/toZmb9S5ke2z0R8eZeqk+/5R6bmbWbdu2xlRk8\ncmXxQmczM7P+rExg+zxwlaQXujHc38zMrFc1PMeW79qxfUQs6KX6mJmZrZaGPbZ8Uum3vVQXMzOz\n1VbmUOTteci/mZlZv1dmuP87gU9Jmk8aji9SZ+6tLa2ZmZlZN5QJbONbXgszM7Me0vRQZL67/gbA\nPvm1QeGO/2ZmZv1Kmbv7f4l0m6pN8utCSV9sdcXMzMy6o8yhyM8C74yI52DFraluAn7YyoqZmZl1\nR5lRkQKWF+aX4wdzmplZP1Wmx3YecI
uky/P8vsC5rauSmZlZ9zUNbBHxfUk3ALvmRYdExB2trZaZ\nmVn3lOmxAcwiPRNtDQBJW/g2W2Zm1h81DWx5BORxwBJWnl8LwBdom5lZv1Omx/YlYJuIeKLVlTEz\nM1tdZUZFLgT+0eqKmJmZ9YQyPbYHgE5JvwVerCyMiO+3rFZmZmbdVCawLcivYfllZmbWb5UZ7n98\nb1TEzMysJ9Q9xybpbElvqbNuXUmHSvpU66pmZmbWdY16bD8CvpWD2z3A48BwYGtgBPBT0s2RzczM\n+o26gS0iZgH7SVoPeAfwOuAFYE5EzO2l+pmZmXVJmXNszwKdra+KmZnZ6itzHZuZmdmA4cBmZmZt\npcwTtGuOjDQzM+uPyvTYzpB0q6TDJY1seY3MzMxWQ9PAFhHvAT4FbA78VdJFknZvec3MzMy6odQ5\ntoiYB3wTOArYDThN0r2SPtrKypmZmXVVmXNsb5V0CjAHeD+wT0SMy9OntLh+ZmZmXVLmJsg/BM4B\njo2IFyoLI+IRSd9sWc3MzMy6oUxg2xt4ISKWA0gaAgyPiOcj4oKW1s7MzKyLypxjuxZYuzC/Tl5m\nZmbW75QJbMPzbbWAFbfYWqd1VTIzM+u+MoHtOUk7VmYk/QvpZshmZmb9TpnA9mXgl5L+KOlG4GJg\nctkNSPpQvjTgPklH1UlzmqR5kmZJ2qFZXkkbSpopaa6kq4sXjks6Jpc1R9IeheUnSlogaWnVtodJ\nmpbz3CRpi7L7ZmZm/U+ZC7RvA7YFPg/8BzAuIv5apvA80OR0YDywPXCApG2r0uwJbBURWwOTgB+X\nyHs0cG1EbANcBxyT82wH7AeMA/Yk3TVFOc8MYKca1fws8GTe/qnAyWX2zczM+qeyN0HeCXgrsCMp\nwBxYMt/OwLyImB8RLwPTgIlVaSYC5wNExC3ASEmjmuSdCEzN01OBffP0BGBaRCyLiIeAebkcIuLW\niFhSo47Fsi4BPlBy38zMrB9qOtxf0gXAVsAsYHleHORg1MQYYGFh/mFyoGmSZkyTvKMqQSoiFkva\npFDWTYU8i/KyUnWMiOWSnpa0UUQ82SSfmZn1Q2WuY3sHsF1ERKsrk6l5klfpybp1Z/tmZtZPlAls\n9wCjgUe7Uf4ioDgYY7O8rDrN5jXSDGuQd7GkURGxRNJo4LEmZTXycM7ziKShwIh6vbUpU6asmO7o\n6KCjo6NJ0WZmg0tnZyednZ19Wgc164hJuh7YAbgVeLGyPCImNC08BYq5pPNWj+YyDoiIOYU0ewFf\niIi9Je0CnBoRuzTKK+kk0oCPk/JoyQ0j4ug8eOTnwDtJhxivAbYu9jYlPRMR6xfmDwfeHBGHS9of\n2Dci9q+xL73YaTUza71xO41j9OTRLD59MXNum9M8QzdIIiJ69UhYmR7blO4Wns9ZTQZmkgaqnJsD\n06S0Os6KiCsl7SXpfuA54JBGeXPRJwHTJR0KzCeNhCQiZkuaDswGXgYOr0SjHAw/CawtaQFwTkSc\nAJwLXCBpHvAE8KqgZmZmA0fTHhuApLGkns+1ktYBhkbEMy2vXT/iHpuZtZt27bGVeWzNYaRh8D/J\ni8YAv2plpczMzLqrzHVsXwB2BZbCioeObtIwh5mZWR8pE9hejIiXKjOS1qBnh9ebmZn1mDKB7QZJ\nx5IGXewO/BK4orXVMjMz654yge1o4HHgbtK9HK8E/ORsMzPrl5oO94+IV4Cz88vMzKxfK3OvyAep\ncU4tIt7QkhqZmZmthrL3iqwYDnwc2Kg11TEzM1s9ZZ7H9kThtSgiTgX27oW6mZmZdVmZQ5E7FmaH\nkHpwZXp6ZmZmva5MgPp/hellwEPkezOamZn1N2VGRb6vNypiZmbWE8ocijyy0fqI+H7PVcfMzGz1\nlB0VuRMwI8/vQ3o22rxWVcrMzKy7ygS2zYAdK4+pkTQF+G1EfLqVFTMzM+uOMrfUGgW8VJh/KS8z\nMz
Prd8r02M4HbpV0eZ7fF5jauiqZmZl1X5lRkd+R9DvgPXnRIRFxR2urZWZm1j1lDkUCrAMsjYgf\nAA9L2rKFdTIzM+u2poFN0nHAUcAxedGawIWtrJSZmVl3lemxfQSYADwHEBGPAOu3slJmZmbdVSaw\nvRQRQX50jaR1W1slMzOz7isT2KZL+gmwgaTDgGvxQ0fNzKyfKjMq8nuSdgeWAtsA346Ia1peMzMz\ns25oGNgkDQWuzTdCdjAzM7N+r+GhyIhYDrwiaWQv1cfMzGy1lLnzyLPA3ZKuIY+MBIiII1pWKzMz\ns24qE9guyy8zM7N+r25gk7RFRCyICN8X0szMBoxG59h+VZmQdGkv1MXMzGy1NQpsKky/odUVMTMz\n6wmNAlvUmTYzM+u3Gg0eeZukpaSe29p5mjwfETGi5bUzMzProrqBLSKG9mZFzMzMekLZ57GZmZkN\nCA5sZmbWVloe2CR9SNK9ku6TdFSdNKdJmidplqQdmuWVtKGkmZLmSrq6eMsvScfksuZI2qOwfEdJ\nd+WyTi0sP0jSY5Juz69De74VzMyst7Q0sEkaApwOjAe2Bw6QtG1Vmj2BrSJia2AS8OMSeY8m3Zx5\nG+A68tO9JW0H7AeMA/YEzpBUuWzhTOCzEfEm4E2SxheqMS0idsyvn/ZoI5iZWa9qdY9tZ2BeRMyP\niJeBacDEqjQTgfMBIuIWYKSkUU3yTgQqd0SZCuybpyeQgtSyiHgImAfsLGk0sH5E3JbTnV/IA6te\ns2dmZgNYqwPbGGBhYf7hvKxMmkZ5R0XEEoCIWAxsUqesRYWyHm5Qj49KulPSdEmblds1MzPrj8rc\nBLm3daf3tDoXkM8ALoqIlyV9jtQD/ECthFOmTFkx3dHRQUdHx2ps1sys/XR2dtLZ2dmndWh1YFsE\nbFGY3ywvq06zeY00wxrkXSxpVEQsyYcZH2tSVr3lRMRTheXnACfX25liYDMzs1er/tF//PHH93od\nWn0o8jbgjZLGShoG7E/qIRXNAA4EkLQL8HQ+zNgo7wzg4Dx9EPDrwvL9JQ2TtCXwRuDWfLjyH5J2\nzoNJDqzkyYGxYiIwu2d23czM+kJLe2wRsVzSZGAmKYieGxFzJE1Kq+OsiLhS0l6S7ic9yPSQRnlz\n0ScB0/PQ/PmkkZBExGxJ00nB6WXg8IioHKb8AvAzYDhwZURclZcfIWlCTv8kKwOmmZkNQFr5vW+N\nSAq3lZm1k3E7jWP05NEsPn0xc26b0zxDN0giInp15LnvPGJmZm3Fgc3MzNqKA5uZmbUVBzYzM2sr\nDmxmZtZLr9qeAAANfklEQVRWHNjMzKytOLCZmVlbcWAzM7O24sBmZmZtxYHNzMzaigObmZm1FQc2\nMzNrKw5sZmbWVhzYzMysrTiwmZlZW3FgMzOztuLAZmZmbcWBzczM2ooDm5mZtRUHNjMzaysObGZm\n1lYc2MzMrK04sJmZWVtxYDMzs7biwGZmZm3Fgc2sHxm/73jG7TSO8fuO7+uqmA1YDmxm/ciCRQsY\nPXk0CxYt6OuqmA1YDmxmZtZWHNjMzKytOLCZmVlbcWDrgrPPPps5c+b0dTXMzKwBB7Yu+O6V3+Wc\nC87p62qYmVkDDmxdMPJNI/u6ClbFw+PNrJoDmw1oHh5vZtUc2MzMrK20PLBJ+pCkeyXdJ+moOmlO\nkzRP0ixJOzTLK2lDSTMlzZV0taSRhXXH5LLmSNqjsHxHSXflsk4tLB8maVrOc5OkLXq+FQYnHyY0\ns77Q0sAmaQhwOjAe2B44QNK2VWn2BLaKiK2BScCPS+Q9Grg2IrYBrgOOyXm2A/YDxgF7AmdIUs5z\nJvDZiHgT8CZJlW/bzwJP5u2fCpzcaJ8u/dWljNtpHBuP3bitv7QbBaXOzs5SZQyGw4Rl22IwcFus\n5LboW63use0MzIuI+RHxMjANmFiVZiJwPkBE3AKMlDSqSd6JwNQ8
PRXYN09PAKZFxLKIeAiYB+ws\naTSwfkTcltOdX8hTLOsS4AONdmjp80sZPXk0zzz/DKMnj+bGm25cJdBV/haDQl/0XOpts2xdGgUl\n/9Ou5LZYyW2xktuib7U6sI0BFhbmH87LyqRplHdURCwBiIjFwCZ1ylpUKOvhOmWtyBMRy4GnJW1U\nbvdg2SvLVgl0lb/FoHfjLTfWDIL1gksl+FSnqxWU6gWqeoFpMPSizGxw64+DR9Q8yatEb2z/xVkv\noi5UrxL0li1btsp8s95eJRBW0lWCUK2g5EBlZlYlIlr2AnYBrirMHw0cVZXmx8AnCvP3AqMa5QXm\nkHptAKOBObXKB64C3llMk5fvD5xZTJOnhwKP1dmX8Msvv/zyq+uvVsaZWq81aK3bgDdKGgs8Sgoo\nB1SlmQF8AbhY0i7A0xGxRNLfG+SdARwMnAQcBPy6sPznkk4hHWJ8I3BrRISkf0jaOdfpQOC0Qp6D\ngFuAj5MGo7xKRHSnJ2lmZr2spYEtIpZLmgzMJB32PDci5kialFbHWRFxpaS9JN0PPAcc0ihvLvok\nYLqkQ4H5pJGQRMRsSdOB2cDLwOGRu1uk4PkzYDhwZURclZefC1wgaR7wBCmAmpnZAKWV3/tmZmYD\nX38cPNLvlLnIvL+SdK6kJZLuKizrlQvcJR2U08+VdGBh+esl3ZzX/UJSqw+JI2kzSddJ+pukuyUd\nMYjbYi1Jt0i6I7fFcYO1LQrbHiLpdkkz8vygbAtJD0m6M382bs3LBl5b9PZJvYH2IgX/+4GxwJrA\nLGDbvq5XF+r/bmAH4K7CspOAr+fpo4D/ydPbAXeQDlG/Pu93pVd/C7BTnr4SGJ+nPw+ckac/QbqO\nEGBD4P+AkcAGlem87mLg43n6TGBSL7TDaGCHPL0eMBfYdjC2Rd7WOvnvUOBm0nWjg7It8va+AlwI\nzBis/yN5Ww8AG1YtG3Bt0SsfmoH8Io3O/F1h/lUjO/v7ixSUi4HtXlYdVXpvrX0DfsfKUaWzC8ub\njiotpsnzZ5JHvwKPA0MK7XtVT+xnF9vkV8AHB3tbAOsAfwF2GqxtAWwGXAN0sDKwDda2eBB4TdWy\nAdcWPhTZXJmLzAeaTaJ1F7j/Q+kC95plSXoN8FREvFIoa9Me2q9SJL2e1Iu9mdZe7N9v2yIfersD\nWAxcE+muPIOyLYBTgK+RhqZXDNa2COAaSbdJ+ve8bMC1Ra8dw7Z+rSdHEJW5LKLPLp2QtB7p1mlf\niohnJVXv+6Boi/xF8XZJI4DLJW3Pq/e97dtC0t7AkoiYJamjQdK2b4ts14h4VNJrgZmS5jIAPxfu\nsTW3CCje8X+zvGwgW6J0P06U7qP5WF6+CNi8kK6yr/WWr5JH0lBgREQ8SZ12i4gnSPcDHVKjrJbK\nJ50vAS6IiMq1j4OyLSoiYinQCXyIwdkWuwITJD0A/AJ4v6QLgMWDsC2IiEfz38dJh+t3ZgB+LhzY\nmltxkbmkYaRjwTP6uE5dJVb91VO5wB1efYH7/nnk0pasvMB9MemQwc6SRLrAvZjnoDxdvMD9amB3\nSSMlbQjsnpcBXJ/TVm+/1X5KOvb/g8KyQdcWkjaujGyTtHauzxwGYVtExLERsUVEvIH0v31dRHwG\nuIJB1haS1slHNJC0LrAHcDcD8XPRGyckB/qL9Gt2LulpAUf3dX26WPeLgEeAF4EFpAvgNwSuzfs0\nE9igkP4Y0uimOcAeheX/kj/k84AfFJavBUzPy28GXl9Yd3Befh9wYGH5lqRRU/eRRjyt2QvtsCuw\nnDSq9Q7g9vy+bjQI2+Itef9nAXcB38jLB11bVLXLbqwcPDLo2iJvs/L/cTf5u24gtoUv0DYzs7bi\nQ5FmZtZWHNjMzKytOLCZmVlbcWAzM7O24sBmZmZtxYHNzMzaigOb9SlJx0k6sq/rUUZ+rMboGstP\nz4/5+Juk55Uef3K7pI9KOk/S
A4Vlk2vk303SFYVtPCbpr/kxHb+T9K+FtE3L68Z+XS9px9Utp4vb\nHCvp7t7cZhl90RbW83yvSLPyDgbuId04eIWImAzpyxq4IiJWfDFK2gf4z4i4vEnZxQtKp0VE5Xlx\nHcBlkjoiYm5eX6a8gaBHL6KVNDTSjXVtkHOPzXqdpG8oPUzwD8A2heU7KD18cJakSwu3fdpK0jV5\n+V8kbVns5eQ0P1R+OKGkByV9N/eibpX0dklXKT3ccFIhz1fz+lla+bDNsZJmSzpL0j0531qSPga8\nA7gw95TW6sIud/v/LCI6gbOAz5UpT9J4SdML87tp5cMzz8j7u+LhojXyP1OY/pik8/L0xpIuUXpA\n6S2VXmQu/47cJn/Nt2KqLvPIvM27JH2psGpNSRfm9p4uaXhO/z+57WdJOrnJ9o+TdL6kPwIX5M/P\nuMK2r1d66OU6Sg/dvTnXc0JeP1zp4ZV/k3QZMLxe29oA0pu3rPHLL2BH4E7SrXXWJ91C58i87k7g\n3Xn6eOD7efpmYEKeHkb68llx+6O8/Ifk2/CQnin1uTz9fdJtgtYBNgYW5+W7Az/J0yLdG/DdpGfX\nvQS8Ja+7GPhknr4eeHuDfRtL4bl3edl5pIcmVm7jtX2NfMVbOR0EnFa1fiLw2zLlkZ5x9RCwdp4/\no1D/DfLfIXlf3lzYrx3z9NJCWR8Dfpqnfw68K09vTn7eFunef/+ap9chPzerxvs9HFiX1ON9W26r\nV4BdcrpzgSNJt2+6t5B/RJPtH0e6n+uwPP8lYEqeHg3MydPfKbTDSNLtodYmPWD0nLz8LcDLlbbw\na+C+fCjSett7gMsj4kXgxUJvYgTpibk35nRTgelKN2UdExEzACLipZy+2XYqvbm7gXUj4nngeUn/\nzNvag3TT1dtJgW1dYGvSM6EejIjK+Z+/kp4OXNGdx4l8LSIu60a+etusW15ELJd0FbCPpEuBvUnP\nGoN0w9rDSKcgRpOegHxPk21VfBAYp5UNv56kdYA/AadI+jlwWURU33n93aT3+58AuVf0HtL7syAi\nbs7pLgS+CPwAeEHSOcBvgd802T6kHwUv5elfkm6eOwXYj/Q0B0jv9z6SKm0xjHQ3+ffmbRIRd0u6\ns87+2wDiwGb9SVeCxjJWPSRXfQjpxfz3lcJ0ZX6NvK3/joizV6lAOk9WTL+8RtmrRdK+pJ5GAP/e\nJDnA20k3mS3rYmAy8BRwW0Q8p/Rw1f8E/iUiluZDjLX2q3jeq7hepCcfv1yV/iRJvyEF0D9J2iMi\n7utCXVfZdg7MOwMfIN3RfXKerrn9HOeeKxTwiKQnJL0F+AQwqZD8YxExr0b+VRZ1s+7Wj/gcm/W2\nPwD75vNW6wP7wIrngj0padec7jPADRHxLLBQ0kQApUdkrA3MB7aTtKakDUhffmVUvriuBg6tnBOS\ntKnSwxWLaao9A4woWX5dEfGriHh7ROwYEbc3KkPSbsBhpPNsZd1AOgR4GDAtLxsBPAs8o/RsrT3r\n5F0saRul5199pLB8JukwX6Veb8t/3xARf4uIk0mHBLetKu+PpPd7eG7rj+RlAFtIemee/iRwY+6F\nbRARV5EOTb610fbruBj4OukwZqVHejVwRCH/DnnyD8Cn8rI3F7ZnA5h7bNarIuIOSReTHpeyBLi1\nsPpg4Mc5cD1AesQOpCB3lqQTSOe/Ph4RD+VBEveQzqkVA0Sj0XaR63GNpG2Bm/Kv9meAT5N6dPXy\n/yzX73nSeaUXa6TpiacN75cD/LqkdvhooRfUtLyIeCX3og4iPQuLiLhL0ixSz28hcGMxS2H6GNIh\nwMeAvwDr5eVfAn6UD9UNJQWEw4EvS3ofqWf7N+B3VXW5Q9LPSEEvgLMi4s7cM74X+ELuPf4NOBPY\nAPh1ZSAJ6RxYo+3Xcinp8OIJhWUnAqdKuov0w+FBYELe5nmS/pbb5i91yrQBxI+tMTOztuJDkW
Zm\n1lYc2MzMrK04sJmZWVtxYDMzs7biwGZmZm3Fgc3MzNqKA5uZmbUVBzYzM2sr/x8qs68dbHG7BwAA\nAABJRU5ErkJggg==\n",
"text/plain": [
"<matplotlib.figure.Figure at 0x7fd32f433450>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"%%time\n",
"# Pull every (addon, weight) pair to the driver -- maybe we need to subsample!\n",
"vals_idf = corpusIDFs.collect()\n",
"# keep only the numeric weights, stored on the client machine (locally) as a list\n",
"temp = [i[1] for i in vals_idf]\n",
"# plot with matplotlib: normalised 150-bin histogram of the per-add-on weights\n",
"n, bins, patches = plt.hist(temp, 150, normed=1, facecolor='green', alpha=0.75)\n",
"plt.xlabel('document TF-IDF values observed')\n",
"plt.ylabel('Frequency (normalized)')\n",
"plt.title('token frequency for the entire addons corpus')"
]
},
{
"cell_type": "code",
"execution_count": 104,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Most frequently installed add-ons: [(u'loop@mozilla.org', 1.136033874857398), (u'firefox@getpocket.com', 1.3090130986013624), (u'e10srollout@mozilla.org', 1.3104437590737708), (u'firefox-hotfix@mozilla.org', 3.1053768099038606), (u'{d10d0bf8-f5b5-c8b4-a8b2-2b9879e08c5d}', 9.122419052317909), (u'{82AF8DCA-6DE9-405D-BD5E-43525BDAD38A}', 16.710666470607222), (u'cpmanager@mozillaonline.com', 23.53807255330865), (u'cehomepage@mozillaonline.com', 23.787664519027537), (u'tabtweak@mozillaonline.com', 24.439726616591834), (u'easyscreenshot@mozillaonline.com', 24.6370577566743), (u'commonfix@mozillaonline.com', 25.296760392857337), (u'coba@mozilla.com.cn', 25.884897002288838), (u'wx-assistant@mozillaonline.com', 26.258636624925327), (u'ubufox@ubuntu.com', 32.30524162795535), (u'{b9db16a4-6edc-47ec-a1f4-b86292ed211d}', 39.760254977087904), (u'wrc@avast.com', 41.63548061371293), (u'{4ED1F68A-5463-4931-9384-8FFF5ED91D92}', 44.285680340645555), (u'mozilla_cc2@internetdownloadmanager.com', 58.299154175320936), (u'vb@yandex.ru', 61.12352254391374), (u'yasearch@yandex.ru', 63.83243828259857)]\n",
"\n",
"Least popular installed add-ons: [(u'{7d4ef2e7-d197-97af-70ce-35fe64977216}', 4659385.0), (u'tdozulksvfqwrwtzydd@hdwdwaskrnozywp.com', 4659385.0), (u'{37cf493a-4e65-43e0-9f19-5f53f08c158c}', 4659385.0), (u'q7@j.edu', 4659385.0), (u'{cfefecb8-10f2-2de4-f579-77112927fd25}', 4659385.0), (u'{58F4923B-1839-4589-A6CB-1B72EF4D873C}', 4659385.0), (u'@twitter-fixer', 4659385.0), (u'FOIwvRI@gmail.com', 4659385.0), (u'{776C1E65-949E-40A3-8F05-469ADE0A22A5}', 4659385.0), (u'lhaLSke3r@w.com', 4659385.0), (u'{c5c5b4f8-2aea-9f21-80e0-cfafd27df5ac}', 4659385.0), (u'{1017a57d-4ef6-4028-b652-b305eec9a01f}', 4659385.0), (u'f@CUUx0H.com', 4659385.0), (u'{1603256b-dbdc-4737-8a69-0ab4aef7e3b9}', 4659385.0), (u'firefox-dict-switcher@danielnaber.de', 4659385.0), (u'lQ2@ZG.org', 4659385.0), (u'{27dc79cb-1da2-4f17-b6c1-234a8e734e30}', 4659385.0), (u'{3048f9e7-3731-4484-93ce-291550662c52}', 4659385.0), (u'Gr@2ELqYI61.com', 4659385.0), (u'{dbbd3679-b08d-4c12-acba-69ba6c11b79e}', 4659385.0)]\n",
"CPU times: user 80 ms, sys: 12 ms, total: 92 ms\n",
"Wall time: 2.2 s\n"
]
}
],
"source": [
"%%time\n",
"# here is a nice place to examine the most and least frequent addons\n",
"# The smallest corpus-size / install-count ratios belong to the add-ons installed by the most clients.\n",
"mostFrequent_addons = corpusIDFs.takeOrdered(20, lambda p: p[1])\n",
"print 'Most frequently installed add-ons: ' + str(mostFrequent_addons) + '\\n'\n",
"\n",
"# Negating the sort key reverses the order, giving the 20 largest ratios (the rarest add-ons).\n",
"leastFrequent_addons = corpusIDFs.takeOrdered(20, lambda p: -1*p[1])\n",
"print 'Least popular installed add-ons: ' + str(leastFrequent_addons)\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Low-popularity (rare) add-ons may still be very useful in recommending add-ons for some users, i.e. if two users have a similar profile in terms of rare characteristics, that is weighted more heavily than if they are similar in terms of very common characteristics."
]
},
{
"cell_type": "markdown",
"metadata": {
"collapsed": true
},
"source": [
"### OK, now we need to compare the pairwise similarity between add-on weight vectors"
]
},
{
"cell_type": "code",
"execution_count": 128,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 0 ns, sys: 0 ns, total: 0 ns\n",
"Wall time: 27.2 µs\n"
]
}
],
"source": [
"%%time\n",
"import math\n",
"\n",
"def dotprod(x1,x2):\n",
"    \"\"\"Dot product of two sparse vectors stored as {key: weight} dicts.\n",
"\n",
"    Only keys present in both dicts contribute; returns 0 when they share none.\n",
"    set(...).intersection replaces dict.viewkeys(), which exists only on Python 2, so the\n",
"    helper behaves the same on either interpreter.\n",
"    \"\"\"\n",
"    shared_keys = set(x1).intersection(x2)\n",
"    # every shared key is present in both dicts, so plain indexing is safe\n",
"    return sum(x1[k] * x2[k] for k in shared_keys)\n",
"\n",
"def norm(x):\n",
"    \"\"\"Euclidean (L2) norm of a sparse vector stored as a {key: weight} dict; 0.0 when empty.\"\"\"\n",
"    # iterate the values directly: dict.viewkeys() is Python 2 only and the keys are not needed\n",
"    return math.sqrt(sum(v * v for v in x.values()))\n",
"\n",
"def cosSim(x1,x2):\n",
"    \"\"\"Cosine similarity between two sparse {key: weight} vectors.\n",
"\n",
"    Returns 0.0 when either vector has zero length (for example an empty add-on list), where\n",
"    the cosine is undefined and the plain formula would raise ZeroDivisionError.\n",
"    \"\"\"\n",
"    n1 = norm(x1)\n",
"    n2 = norm(x2)\n",
"    if n1 == 0 or n2 == 0:\n",
"        return 0.0\n",
"    return dotprod(x1, x2) / n1 / n2\n",
"\n",
"def cosSimHelper(list_1,list_2):\n",
"    \"\"\"Cosine similarity of two add-on lists after rarity weighting.\n",
"\n",
"    Each list is passed through addon_frequency and then addonRarity to become a weight\n",
"    dict; the two dicts are then compared with cosSim.\n",
"    \"\"\"\n",
"    weighted = [addonRarity(addon_frequency(addons)) for addons in (list_1, list_2)]\n",
"    return cosSim(weighted[0], weighted[1])\n",
"\n",
"def computeCosSim(pair_record):\n",
"    \"\"\"Score one pair of (ping_id, [addon ids]) records.\n",
"\n",
"    Returns ((ping_id_1, ping_id_2), cosine_similarity). Requires the corpus IDF table\n",
"    used by addonRarity to be available wherever this runs.\n",
"    \"\"\"\n",
"    left, right = pair_record[0], pair_record[1]\n",
"    id_pair = (left[0], right[0])\n",
"    return (id_pair, cosSimHelper(left[1], right[1]))\n",
"\n",
"def generate_nonCartesian_pairWiseComparison_RDD(rddq):\n",
"    \"\"\"Pair records up one-to-one instead of building the full cartesian product.\n",
"\n",
"    rddq is split randomly in two (seeded with RAND_SEED); the record at position i of one\n",
"    half is joined with the record at position i of the other. Returns an RDD of\n",
"    (record_from_first_half, record_from_second_half); a position present in only one\n",
"    half is dropped by the join.\n",
"    \"\"\"\n",
"    def keyed_by_position(rdd):\n",
"        # (record, index) -> (index, record) so the index becomes the join key\n",
"        return rdd.zipWithIndex().map(lambda p: (p[1], p[0]))\n",
"\n",
"    first_half, second_half = rddq.randomSplit([0.5,0.5], RAND_SEED)\n",
"    return keyed_by_position(first_half).join(keyed_by_position(second_half)).map(lambda p: p[1])\n",
"\n",
"def generate_nonCartesian_pairWiseComparison_RDD_fast(rddq):\n",
" \"\"\"Variant of the pairing helper that zips the two random halves directly, skipping the index join.\n",
"\n",
" NOTE(review): RDD.zip expects both RDDs to have the same number of partitions and the same\n",
" number of elements in each partition; nothing here makes the randomSplit halves satisfy\n",
" that -- confirm before relying on this function.\n",
" \"\"\"\n",
" rdd1,rdd2 = rddq.randomSplit([0.5,0.5], RAND_SEED)\n",
"# rdd1b = rdd1.zipWithIndex().map(lambda p: (p[1],p[0]))\n",
"# rdd2b = rdd2.zipWithIndex().map(lambda p: (p[1],p[0]))\n",
" rddc = rdd1.zip(rdd2)\n",
" return rddc\n",
"\n",
"def safe_unicode(obj, *args):\n",
"    \"\"\"Best-effort string conversion used when building categorical tokens.\n",
"\n",
"    Returns str(obj, *args). If that raises UnicodeEncodeError, the fixed placeholder\n",
"    'ascii_text_fail_sauce' is returned instead, so every value that fails the conversion\n",
"    maps to the same placeholder text.\n",
"    \"\"\"\n",
"    try:\n",
"        return str(obj, *args)\n",
"    except UnicodeEncodeError:\n",
"        return unicode('ascii_text_fail_sauce')\n",
"\n",
"def catVarRarity(addons_freq):\n",
"    \"\"\"Weight each categorical token in `addons_freq` by its corpus rarity, mirroring addonRarity.\n",
"\n",
"    addons_freq: iterable of tokens; a dict keyed by token works, only the keys are read.\n",
"    Returns {token: (1 / len(addons_freq)) * idf}. A token missing from the table is given\n",
"    idf 1. Empty input returns an empty dict instead of dividing by zero.\n",
"\n",
"    Reads the module-level corpusCatIDFsBroadcast, accepted either as a plain\n",
"    {token: idf} dict or as a Spark Broadcast handle wrapping such a dict.\n",
"    \"\"\"\n",
"    if not addons_freq:\n",
"        return dict()\n",
"    # unwrap a Broadcast handle when one is supplied; a plain dict passes through unchanged\n",
"    idf_lookup = getattr(corpusCatIDFsBroadcast, 'value', corpusCatIDFsBroadcast)\n",
"    # if duplicates were possible, the tf would be count(i) / len(addons_freq), computed inside the loop\n",
"    t = float(1) / len(addons_freq)\n",
"    addon_dict_intf = dict()\n",
"    for i in addons_freq:\n",
"        addon_dict_intf[i] = t * idf_lookup.get(i, 1)\n",
"    return addon_dict_intf\n",
"\n",
"def tokenize_catVar(p):\n",
"    # Turn one Row into (p[0], tokens), where tokens holds one 'field_value'\n",
"    # string for every field of the Row except client_id.\n",
"    # NOTE(review): p[0] is returned as the id -- presumably client_id is the\n",
"    # first column of the Row; confirm against the query that builds it.\n",
"    fields = p.asDict()\n",
"    fields.pop('client_id')\n",
"    tokens = [safe_unicode(field) + '_' + safe_unicode(value) for field, value in fields.items()]\n",
"    return (p[0], tokens)"
]
},
{
"cell_type": "code",
"execution_count": 110,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"59800\n",
"CPU times: user 676 ms, sys: 128 ms, total: 804 ms\n",
"Wall time: 7.11 s\n"
]
}
],
"source": [
"%%time\n",
"# Pair up the records of subset_addon_tok_filtered_rdd one-to-one and print the number of pairs.\n",
"crossSmall2 = generate_nonCartesian_pairWiseComparison_RDD(subset_addon_tok_filtered_rdd)\n",
"print crossSmall2.count()"
]
},
{
"cell_type": "code",
"execution_count": 115,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 504 ms, sys: 0 ns, total: 504 ms\n",
"Wall time: 784 ms\n"
]
}
],
"source": [
"%%time\n",
"# Score every pair with computeCosSim; the result is cached because later cells reuse sims.\n",
"sims = crossSmall2.map(lambda p: computeCosSim(p)).cache()\n",
"sims.take(1)"
]
},
{
"cell_type": "code",
"execution_count": 125,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"<matplotlib.text.Text at 0x7fd336303210>"
]
},
"execution_count": 125,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAe8AAAEZCAYAAAC+Qq3JAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAIABJREFUeJzt3Xe4HVW5x/Hvj4TeEalCBJQqVZoUOYAKiBRBEYOIqIgX\nbFflAhaCiIINURBRBKQjVYoICBKK0jsSMWgglBA6oQkhee8fa22Y7Ox9zj4nJ3uycn6f5znPmT1l\nzTsza+adPooIzMzMrBxz1B2AmZmZ9Y+Tt5mZWWGcvM3MzArj5G1mZlYYJ28zM7PCOHmbmZkVpt/J\nW9LJkg7LzZtJGjNYwUi6TNKeuXkvSdcPYtkjJV0+WOVVyh3UeVDHeCW9KOmdg1FWB+P6taRvD3DY\ngyX9NjePkDRV0oB2QAd5/i0naZIkDUZ5fYxrlKTTeuk+TtJWMzuOUkg6XNJTkh6vO5YSzKz6I2kT\nSf/K68mO3Rpvi/EM6rYub4NWzM0D3rY1ldnRtm34jIwkIm4AVusgmFHAShHx6T7K+3Bzq4HEJWkE\nMA4YHhFTc9lnAmcOpLzedDoPZuXxRsSCg1FOh+P6nxkY9ojmVjNQ1jTzT9I44HMR8dcBlPUIsNBA\nYxkAv5yhA5KWA74OLBcRzwxSmQ8BSwBv5FZ/j4htc7e1SNuYJYAjIuLnuf1w4AZg14h4bDDiKNBh\nwC8j4tg6g6hu6ySdDDwSEYfMSJGVsge8beut3HZmKHkPJkmKwXtjjEgTP9OPhGZFkoZFxJS645hV\nef4MGSOApweSuHupIwFsHxHXtOh2BGln4V7gXklnRMSTud15QzhxQ1oW99cdxExQX46JiF7/gHWB\n24EXgLOBs4DDcrctSHsujX4PBB4FJgFjgC2BbYDX8t+LwJ2532uAw0l7pC8DK+Z2n83d98rdjgGe\nJy34rSrjGtf0exRwam5+GJiSxzcJ2CiXd32l/02AW4DngJuB91W6XUPaU7whD385sFib+dM8D8YB\n3wDuzmWfBczVZti+pvEzud0k4EHgC32M9//yeF8FPg9cXOk+FvhD5fd4YK3cPBVYMTd/GPhHHucj\nwNcrw3wEuDNP1w3Amr3Um58DE3O9uRtYPbc/ubn+AAfkfh8DdgK2Ax4AngYOblrGp+XmEXkZz9Hp\nvMrzZwJwSnX+Aafmsl7Jw38TuBTYv2ma7gZ2ajGtI/I8bMTSn/qzCHAJ8CTwTG5eptL9ncDoPB+v\nyHXl1Er3PYGHgKeAb1FZL4C5gKPzfH00L5M5m+bJ1yvz/jOVctvWg6b4V8rxPZ+n4axKt1WBK/N0\njQE+Xul2MnBsns+TgBuBFTqoP3MBPyWt4xOA44C5W8S1dV6eb+TyT8rtdwTuA54F/gqs2ss6NEeL\ncqfZ7jR1u78yf28E1s914yZgWB/b2bmB00h1vrFNens/6nZ/1qFzSdvyScBt5O1A8/SREtNBeZxP\n5WEW6WUa9iFtZ54G/ggslds/mJdDY/2as7f5yozV28VI69ALeR5+n2m3+1NJuWYf4HXgvzmmi5q3\nhc3bq/z7AODxHNfepO3Gis39dhDnh4E7cpwPA6Oatidvbtvazu8+KtScpA3DV4BhwK55gqsBjs/N\nK5MSwpL59/LklZFKYq2UfU0ue1XStffhTJ+8J1fGvRtpA7FIq5WIaZN3Y+JV6b4XcF1uXpS08o7M\n4949/160EttY0oZp7vz7h23m0ZvzoBLXTcCSpA3z/VRWtqZh+5rG7YB35ubNSTs56/Qy3juAZXLM\nKwDP5m5L53ndWFYrAs9Uhq1WwMeBTXLzwpXxrUuqhOuTVuo98zhbrYgfAm4FFsy/V6nUi+YKPhn4\ndp7+z5MSwOnAfMDqpBV+RC/LeI4O59Vk4IekOj13m/m3ZeX3x4GbKr/XJm3AhreY3uZY+lN/FgM+\nmvubH/gDcGGl+9+Bn+S4NydtaBrzYHXSDuqm
ufvPSOtnYyN4WB7+bfnvb8D3mubJqDzvt8vzbOHe\n6kGL+M8kJwfSRrcxzHyk7cGnc31pzL9VK/XgKeC9pHXwdODMDurPz0mJYeE8vy4CftDhurky8BKw\nVZ7mA/JyGt5qHWpT5jjSTsNE0k5ZNfH9AdgeeEeef4sCFwKb9badzcN+IU/L3Hl+rQss0I+63Z91\n6DVSnRtGOtD4D3nngmmT6Fdz/VmaVL9+3VhGLeLfKi/PtXO/vwSubbd+tZmvg1FvzybVyblJl8XG\nk7f7LbZ10yTm5u4ttlfb5mW/GjAvcEa78jqI8/3AGrn5PbncHVttT9r99XWzz8akiv3LiJgSEeeT\nVqpWppBW3vdIGh4R4yNiXB/l/z4i/hkRUyPijRbdJ1bGfQ5pT3L7PsqsandKY3vgXxFxZh732cA/\ngR0q/ZwcEf+OiNeAc4B1+jHeX0TExIh4nrQX2NuwbacxIv4cEQ/l5utJRzGb9zHexyPitTzvX5S0\nDqmiXAE8Lmnl/Lt6M2B1Pr0OrCFpwYh4ISLuyu33AY6PiNsiOY20Edi4RRyTgQWB1fPlkAciYmKb\nmF8nJbYppBVvceDoiHglIu4n7fys3cs0Ax3NqymkvdvJeZm2Up0PFwPvlrRS/v0p0pmLVvW0lY7q\nT0Q8GxEX5mX2MunU6/sBJC1P2lk6JMd9Pak+NewKXBIRf4uIycB3mfZa2UjSRu+ZSKeOv0fa6Wp4\nHfh+rnt/JiW2VSrdWtWDZpOBEZKWjYjXI+Lvuf1HgHERcWquL3cD55N2ihoujIjbI92XckZlHvVW\nf/YB/jfH9DJwJPDJNrE12w24NCL+muvbT0kb4U0q/by5DrUpYyTpbMgI0hmHKyQ17nc4ANiPtHPx\nNWAz0s7Ww5L+KOkaSR9rU+5kUqJaOc+vOyPiJeiobvd3Hbo917kpwFHAPLRej/cFvh0RE3L9Ogz4\nWJsbqUYCJ0bE3bnfg4H35Trc0Okp5gHV2xzXLqT15bWIGEM6y1bVVwy9df84ab0eExGvAof2UVbb\n9SsirouIf+Tm+0jLbYs+yptGX8l7GdLhftXDrXqMiH+TKuyhwERJZ0paqo/yH+mje6txL9PHMJ1Y\nhumn42Fg2crvJyrNrwAL9KP8aqLqa9i20yhpO0k3SnpG0nOkvbfFeynr0abf15IuXbyftKEZDfSQ\nKsm1bcrYlbTz8HDe2DRW6hHANyQ9m/+eIx1hTLc8Il0PPBb4FakuHC+p3Tx4JvLuJulUJaQjByrt\n+pz3Hcyrp/JGpSN54/0H4FP5LvJPkk5rdqqj+iNpXkm/kfSQpOdJy2WRPM6lgefyhqKhWm+XobIO\nRcQrpFPU1e7jm4atLq9ncuJsFWe7etDsANJ25BZJ90raO7cfAWzcVF9Gks5INbScR+3qj6S3k44m\nb2+UC/yZlPQ6Mc16n+vdI0y73jevQ9OIiBtzYvhvRBxJOlO2ee42PiK2j4j1STtZh5EuwfyUdPls\nR+AoSYu0KPpU0g722ZIelXSkpGHQUd3u7zpUrTORp7nVdnUEcGFlXt9P2slYskW/zfP2ZVJdXLZF\nv30ZaL19O+kot7oM+8ox/Y2rWt7D9J7s265fkjaS9FdJT+b1fl9637ZPp6/kPYHpZ/7yrXoEiIiz\nI2Jz0kIH+FGjU7tB+hh/q3E3Hvl4mbQiN1R3FPoq93HS3nNz2XXcUNJyGiXNBZwH/Jh07WtR0oaq\nt8rSPN3XkZL1ZqSkcB0pcb+fNsk7HwntTFoRLiIdNUKqtD+IiMXy36IRsUBE/KFNOcfmjdjqpL3N\nA3qJe4Z0OK/6qhOtup9KOuLeGng5Im4ehHCbfQN4N7BBRCxCPuomxT4BWFTSvJX+q+vfBGC5xg9J\n8zFtInuct9ZFcnNHj0z1Ug+a+3syIr4QEcsCXwSOy4/OPAKMbqovC0XElzocf6v68zRpA7hGpdxF\nImLhTspk
+vkBaf5VN/Z91ZPpQqX1OnkI8NuIeApYk3S0+2Ie17umKyQdnX0/ItYgnQnYAfj0ALcD\nfanWGZF2wltt+8YD2zUtw/kjYkKLfqeZt5LmJ9XFXneG2hhovX2KdG39HZV2y7XpF1ov61don1em\nWd9yXP2tLw1nkM7QLJvX+9/Qz2XaV/K+EXhD0pclDZe0C7Bhqx4lrSxpy1zZXift7TX2OiYC78wV\npT+WrIz746Tr45flbncBu+du6wPV01FP5XGvRGuXkU6J7i5pmKRPkK5jXNKm/5lpiRbT+CfSJYi5\nSHfLTpW0HelaYH80jrznjYjHSafKtyWtVHc29yxpTqXn4RfKp9ReJJ1uBjgB+KKkDXO/80v6cF5J\nm8tZX9KG+RGZV0k3hUxt7m8QNOrTYMyrJ0j3ArwpIm4ixf0z+j7qHujGdEHSPJokaTEqp+IiYjzp\nhqLv5WWzGdNe2jkP+IjSM7Rzko70qnGcBXxH0uKSFiedVu/z7EEf9aC5349JauyAPk+aX1NJN6Kt\nLOlTuW7PmevFKq3KaSqzZf3JR4knAEfno3AkLSup02V9DrB93k4Nl/TNXPaNnQys9Dz/Jnla5pZ0\nAG9dk632tzppJ/n43Oo/wFaSliQl7upRZWOYHknvUTr1+xLpCLdxKXJG63az90raOR/Z/y9pHrTa\nMf0N8EPlU9+S3q4Wz2hnZwF7S1pL0tyk+0tuivQYZX8NqN7mo9wLgEPzGa1VSfdctDORpnWetF0c\nKWkOSdsy7ansc4DPSFot7yjPyCNmC5DOqk3O29SRTd373J70mrzzacZdSHfVPUM6539+m97nJl1/\neoq0l/R20nUPSHc3CnhG0m2N4luNsun3TaSjkqdJdw3uGhHP5W7fJa0Iz5JuCjijEverwA+Av+VT\nPtPscETEs6Rrct/MZX+T9PhHo+yB7k0NZNibmX4an8/Xu74CnJtPWe1OOgLqeLwRMZa04b0u/34R\n+DdwQ+U0W/OwewLj8qmcL5ArVUTcTrreeGyO51+kG+5aWYi0kX2WdCPK06SbrjrRPB29zc/IsfV3\nXrVyJPDdXF++Xml/KumGktP7GL7d/OzL0aQ9/adJN+lc1tR9JOl65DOkOv/mNbx8PXN/0sbu8dxP\n9UjncFLyv4d0B/VtpPWik2loWQ9a2AC4WdIk0pHEVyLiobxMPkRaFo/nvyNJ24m+9FZ/DiTdvXxT\nju1K0o1ofYqIf5HOpBxL2k5tD+wQb93H0NdyW5B009azpPn8IWDbynaj4VjSfGiU9y3SzV/3ks5e\nPcn0liLtjL1Ausv/GuD0Adbtvtahi4BPkO5q3wPYJd56LK7a7y9yv1dKeoFUP1sevEXE1aT6eQHp\nKH6FHGu7GHqLeUbq7ZdJNwo3nio5k3RvTqt+TyTd1/GspAtyu6+RLm88R7pUduGbA0ZcTlpf/0ra\n/l3dxzT1Fud+wPfzfP0O6RJdu35b0rTb8NmXpHeQNsRLko4MToiIX0palDTjRpDuyN4tIl7IwxwM\nfJZ0KuarEXHlIMe0F+nFIO/vs2erhdIb//bxMrLZgTp8YdbsQtKRpCcV9u6z58IMpXebv0F6VnUN\n4H3A/vm0ykHAVRGxCmmP6mB489TXbqTT6duRruUNyZe+DFX51Nh+pNOHZjaLk7SKpDVz84bA50hn\nA2Y7QyZ5R8QTkR93yaeixpBubNiJt05FngLsnJt3BM6OiDciPaYxljanjGz2k6+jPkk6/XZWzeGY\nWWcWBC6Q9BJpvf1JRNRxL9NMN2ROm1cpvZh+NOla5iP5Ds5Gt2cjYjFJxwA3RnonOpJ+B1wWEbPl\nXpyZmZVjyBx5Nyg9b3we6Rr2S/TvBikzM7PazTIfJumG/OjJeaT3Yzfu2JwoacmImKj0UpnGnaCP\nMe0zfS2fhZTkZG9mNgAR4fuIBmioHXmfBNwfEb+otLuY9OJ/SI8+XVRpv7
ukuSStQHos7ZZWhUYf\n7y0u+W/UqFG1x+Dp8/QNxembnactwsc8M2rIHHlL2pT0TOO9ku4knR7/FuktcOdI+izpdXe7QXqG\nVtI5vPVKwP3CNc7MzGYBQyZ5R8TfSO+9beUDbYY5gvShCDMzs1nGUDttbv3U09NTdwgzlaevbLPz\n9M3O02Yzbkg+KjaYJPlsuplZP0kifMPagPnI28zMrDBO3mZmZoVx8jYzMyuMk/cguOGGG/zcopmZ\ndY2T9yDY91v7cs8999QdhpmZDRFO3oNgrrfNxdSpU+sOw8zMhggnbzMzs8I4eZuZmRXGydvMzKww\nTt5mZmaFcfI2MzMrjJO3mZlZYZy8zczMCuPkbWZmVhgnbzMzs8I4eZuZmRXGydvMzKwwTt5mZmaF\ncfI2MzMrjJO3mZlZYZy8zczMCuPkbWZmVhgnbzMzs8I4eZuZmRXGydvMzKwwTt5mZmaFcfI2MzMr\njJO3mZlZYZy8zczMCuPkbWZmVhgnbzMzs8I4eZuZmRXGydvMzKwwTt5mZmaFcfI2MzMrjJO3mZlZ\nYZy8zczMCuPkbWZmVhgnbzMzs8I4eZuZmRXGydvMzKwwTt5mZmaFcfI2MzMrjJO3mZlZYZy8zczM\nCjNkkrekEyVNlHRPpd0oSY9KuiP/bVvpdrCksZLGSPpQPVGbmZlNb8gkb+BkYJsW7Y+KiPXy3+UA\nklYDdgNWA7YDjpOk7oVqZmbW3vC6A+gPSXMAawPLAK8C90XEk50MGxE3SBrRqtgW7XYCzo6IN4CH\nJI0FNgRuHljkZmZmg6eI5C1pJeBA4APAWOApYB5gZUmvAL8BTomIqQMo/kuS9gRuA74RES8AywI3\nVvp5LLczMzOrXRHJGzgc+DWwb0REtYOkJYCRwJ7AKf0s9zjgsIgISYcDPwM+39/gJoyZwPHHH8/S\nSy9NT08PPT09/S3CzGy2Nnr0aEaPHl13GLMNNeXC2Vo+bX5JRKzVWzdJBwERET/K3S4HRkXEdKfN\nJcU6O6/DSYecxLrrrjuzJ8HMbLYgiYjwvUQDVMSRt6RdeuseERd0WhSVa9ySloqIJ/LPXYD7cvPF\nwBmSfk46Xf4u4JZ+BW1mZjaTFJG8gR3y/yWATYC/5t9bAn8H+kzeks4EeoC3SRoPjAK2lLQOMBV4\nCNgXICLul3QOcD8wGdiv+XS9mZlZXYpI3hGxN4CkK4HVI2JC/r008PsOyxjZovXJvfR/BHBEv4M1\nMzObyUp7znu5RuLOJgLL1xWMmZlZHYo48q64WtIVwFn59yeAq2qMx8zMrOuKSt4R8SVJHwXen1v9\nNiIurDMmMzOzbisqeWd3AC9GxFWS5pO0YES8WHdQZmZm3VLUNW9J+wDnkd6oBukxrj/WF5GZmVn3\nFZW8gf2BTYFJABExlvT4mJmZ2ZBRWvJ+LSJeb/yQNBzw89dmZjaklJa8r5X0LWBeSR8EzgUuqTkm\nMzOzrioteR9E+qLYvaS3oV0WEd+uNyQzM7PuKu1u8z1I39k+odFC0kci4tIaYzIzM+uq0o68jwGu\nl7Rapd1hdQVjZmZWh9KS9zjgs8B5kj6e2/mTcmZmNqSUdto8IuIOSVsAZ0naCBhWd1BmZmbdVNqR\n9wSAiHga2Ib0mNh7ao3IzMysy4pK3hGxfaV5akQcEBFFTYOZmdmMKuK0uaSjI+Jrki6hxUtZImLH\nGsIyMzOrRRHJGzgt//9prVGYmZnNAopI3hFxe/5/bd2xmJmZ1a2I5C3pXnp5h3lErNXFcMzMzGpV\nRPIGPlJ3AGZmZrOKIpJ3RDxcdwxmZmaziqIes5K0saRbJb0k6XVJUyRNqjsuMzOzbioqeQPHAp8E\nxgLzAp8HflVrRGZmZl1WWvImIh4EhkXElIg4Gdi27pjMzMy6qYhr3hWvSJoLuEvSj0mvSy1uB8TM\nzGxGlJb49iR9iORLwMvAcsCutUZkZm
bWZUUdeVfuOn8V+F6dsZiZmdWlqCNvSR+RdKekZyVNkvSi\n7zY3M7Ohpqgjb+BoYBfg3oho+8Y1MzOz2VlRR97AI8B9TtxmZjaUlXbk/X/AZZKuBV5rtIyIo+oL\nyczMrLtKS94/AF4C5gHmqjkWMzOzWpSWvJeJiPfUHYSZmVmdSrvmfZmkD9UdhJmZWZ1KS97/A1wu\n6VU/KmZmZkNVMafNJQlYIyLG1x2LmZlZnYo58s6Ph/2p7jjMzMzqVkzyzu6QtEHdQZiZmdWpmNPm\n2UbAHpIeJn2YRKSD8rXqDcvMzKx7Skve29QdgJmZWd2KOm2evyq2CLBD/luk8qUxMzOzIaGo5C3p\nq8AZwBL573RJX643KjMzs+4q7bT554CNIuJlAEk/Am4Ejqk1KjMzsy4q6sibdIPalMrvKbmdmZnZ\nkFHakffJwM2SLsy/dwZOrDEeMzOzrisqeUfEUflzoJvmVntHxJ11xmRmZtZtRSXv7C5gAjl2Scv7\nlalmZjaUFHXNO99ZPhH4C3Ap6XWpl3Y47ImSJkq6p9JuUUlXSnpA0hWSFq50O1jSWElj/CUzMzOb\nlRSVvIGvAqtExBoRsVZErNmPt6udzPQveTkIuCoiVgH+ChwMIGl1YDdgNWA74Lj8YRQzM7PalZa8\nHwFeGMiAEXED8FxT652AU3LzKaQb4AB2BM6OiDci4iFgLLDhQMZrZmY22Eq75v0fYLSkPwGvNVpG\nxFEDLG+JiJiYy3hC0hK5/bKk58cbHsvtzMzMalda8h6f/+bKf4MtBjLQhDETOP7441l66aXp6emh\np6dnkMMyMyvb6NGjGT16dN1hzDaUPpM9NEgaAVzSuE4uaQzQExETJS0FXBMRq0k6iPS1sh/l/i4H\nRkXEzS3KjHV2XoeTDjmJddddt4tTY2ZWLklEhO8lGqAirnlLOkHSmm26zS/ps5L26KQopn0j28XA\nZ3LzXsBFlfa7S5pL0grAu4BbBhS8mZnZICvltPmvgO/mBH4f8BQwD/BuYCHgJNIHS9qSdCbQA7xN\n0nhgFHAkcK6kzwIPk+4wJyLul3QOcD8wGdgvhtIpCjMzm6UVkbwj4i5gN0kLAOsDSwOvAmMi4oEO\nyxjZptMH2vR/BHDEAMI1MzObqYpI3g0R8RIwuu44zMzM6lTENW8zMzN7i5O3mZlZYYpK3u3uODcz\nMxtKikrepHeM3yJpv+pHRMzMzIaSopJ3RGwO7AEsB9wu6UxJH6w5LDMzs64qKnkDRMRY4DvAgcAW\nwC8l/VPSLvVGZmZm1h1FJW9Ja0n6OTAG2ArYISJWy80/rzU4MzOzLinqOW/gGOB3wLci4tVGy4h4\nXNJ36gvLzMyse0pL3tsDr0bEFABJcwDzRMQrEXFavaGZmZl1R1GnzYGrgHkrv+fL7czMzIaM0pL3\nPPkVqcCbr0udr8Z4zMzMuq605P2ypPUaPyS9l/SBEjMzsyGjtGveXyN9wvNx0ne5lwI+UW9IZmZm\n3VVU8o6IWyWtCqySWz0QEZPrjMnMzKzbikre2QbAO0mxryeJiDi13pDMzMy6p6jkLek0YCXgLmBK\nbh2Ak7eZmQ0ZRSVvYH1g9YiIugMxMzOrS2l3m99HuknNzMxsyCrtyHtx4H5JtwCvNVpGxI71hWRm\nZtZdpSXvQ+sOwMzMrG5FJe+IuFbSCODdEXGVpPmAYXXHZWZm1k1FXfOWtA9wHvCb3GpZ4I/1RWRm\nZtZ9RSVvYH9gU2ASQESMBZaoNSIzM7MuKy15vxYRrzd+SBpOes7bzMxsyCgteV8r6VvAvJI+CJwL\nXFJzTGZmZl1VWvI+CHgKuBfYF7gM+E6tEZmZmXVZaXebTwVOyH9mZmZDUlHJW9I4WlzjjogVawjH\nzMysFkUlb9K7zRvmAT4OLFZTLGZmZrUo6pp3RDxT+XssIo4Gtq87LjMzs24q6shb0nqVn3OQjsSL\nmg
YzM7MZVVri+1ml+Q3gIWC3ekIxMzOrR1HJOyK2rDsGMzOzuhWVvCV9vbfuEXFUt2IxMzOrS1HJ\nm3SNewPg4vx7B+AWYGxtEZmZmXVZacn7HcB6EfEigKRDgT9FxKdqjcrMzKyLinpUDFgSeL3y+/Xc\nzszMbMgo7cj7VOAWSRfm3zsDp9QYj5mZWdcVlbwj4geS/gxsnlvtHRF31hmTmZlZt5V22hxgPmBS\nRPwCeFTSCnUHZGZm1k1FJW9Jo4ADgYNzqzmB0+uLyMzMrPuKSt7AR4EdgZcBIuJxYMFaIzIzM+uy\n0pL36xER5M+CSpq/5njMzMy6rrTkfY6k3wCLSNoHuAo4oeaYzMzMuqq0u81/KumDwCRgFeCQiPhL\nzWGZmZl1VTHJW9Iw4Kr8cRInbDMzG7KKSd4RMUXSVEkLR8QLg1m2pIeAF4CpwOSI2FDSosAfgBHk\nT48O9njNzMwGopjknb0E3CvpL+Q7zgEi4iszWO5UoCcinqu0O4h0pP9jSY3H0w6awfGYmZnNsNKS\n9wX5b7CJ6W/e2wnYIjefAozGydvMzGYBRSRvSctHxPiImFnvMQ/gL5KmAL+JiN8BS0bERICIeELS\nEjNp3GZmZv1SRPIG/gisByDp/IjYdZDL3zQiJkh6O3ClpAfIz5JXNP9+04QxEzj++ONZeuml6enp\noaenZ5DDMzMr2+jRoxk9enTdYcw2lN55MmuTdGdErNvcPJPGNYp0bf3zpOvgEyUtBVwTEau16D/W\n2XkdTjrkJNZdd6aFZWY2W5FERKjuOEpVyktaok3zDJM0n6QFcvP8wIeAe4GLgc/k3vYCLhrM8ZqZ\nmQ1UKafN15Y0iXRj2by5mfw7ImKhGSh7SeBCSUGaH2dExJWSbiO90e2zwMPAbjMwDjMzs0FTRPKO\niGEzsexxwDot2j8LfGBmjdfMzGygSjltbmZmZpmTt5mZWWGcvM3MzArj5G1mZlYYJ28zM7PCOHmb\nmZkVxsnbzMysME7eZmZmhXHyNjMzK4yTt5mZWWGcvM3MzArj5G1mZlYYJ28zM7PCOHmbmZkVxsnb\nzMysME7eZmZmhXHyNjMzK4yTt5mZWWGcvM3MzArj5G1mZlYYJ28zM7PCOHmbmZkVxsnbzMysME7e\nZmZmhXHyNjMzK4yTt5mZWWGcvM3MzArj5G1mZlYYJ28zM7PCOHmbmZkVxsnbzMysME7eZmZmhXHy\nNjMzK4yTt5mZWWGcvM3MzArj5G1mZlYYJ28zM7PCOHmbmZkVxsnbzMysME7eZmZmhXHyNjMzK4yT\nt5mZWWFX+OQOAAALSElEQVScvM3MzArj5G1mZlYYJ28zM7PCOHmbmZkVxsm7D5K2lfRPSf+SdGDd\n8ZiZmTl590LSHMCxwDbAGsAnJa1ab1TdNXr06LpDmKk8fWWbXadvm523YYPNNqg7DJuFOXn3bkNg\nbEQ8HBGTgbOBnWqOqatm141jg6evbLPr9I1/bDzj/jOu7jBsFubk3btlgUcqvx/N7czMzGrj5D0I\nXn/qdfb/5v4sPmJxVttgNRYfsTiLj1icbXbepu7QzMxsNqSIqDuGWZakjYFDI2Lb/PsgICLiR5V+\nPAPNzAYgIlR3DKVy8u6FpGHAA8DWwATgFuCTETGm1sDMzGxIG153ALOyiJgi6UvAlaRLDCc6cZuZ\nWd185G1mZlYY37DWoU5e1iLpl5LGSrpL0jrdjnFG9DV9kkZKujv/3SBpzTriHKhOX7YjaQNJkyXt\n0s34ZkSHdbNH0p2S7pN0TbdjnBEd1M2FJF2c17t7JX2mhjAHRNKJkiZKuqeXfkrervQ6faVvV2oV\nEf7r44+0k/MgMAKYE7gLWLWpn+2AP+XmjYCb6o57kKdvY2Dh3Lzt7DZ9lf6uBi4Fdqk77kFcdgsD\n/wCWzb8XrzvuQZ6+g4EjGtMGPAMMrzv2DqdvM2Ad4J423YvdrnQ4
fcVuV+r+85F3Zzp5WctOwKkA\nEXEzsLCkJbsb5oD1OX0RcVNEvJB/3kRZz7t3+rKdLwPnAU92M7gZ1Mm0jQTOj4jHACLi6S7HOCM6\nmb4AFszNCwLPRMQbXYxxwCLiBuC5XnopebvS5/QVvl2plZN3Zzp5WUtzP4+16GdW1d+X0Xwe+PNM\njWhw9Tl9kpYBdo6IXwMlPb7SybJbGVhM0jWSbpW0Z9eim3GdTN+xwOqSHgfuBr7apdi6oeTtSn+V\ntl2ple82t36RtCWwN+l02OzkaKB6PbWkBN6X4cB6wFbA/MCNkm6MiAfrDWvQbAPcGRFbSVoJ+Iuk\ntSLipboDs87MxtuVmcbJuzOPActXfr8jt2vuZ7k++plVdTJ9SFoL+C2wbUT0dqpvVtPJ9K0PnC1J\npOum20maHBEXdynGgepk2h4Fno6I/wL/lXQdsDbpWvKsrpPp2xs4AiAi/i1pHLAqcFtXIpy5St6u\ndKTg7UqtfNq8M7cC75I0QtJcwO5A80b9YuDT8Oab2Z6PiIndDXPA+pw+ScsD5wN7RsS/a4hxRvQ5\nfRGxYv5bgXTde78CEjd0VjcvAjaTNEzSfKQbn0p5X0En0/cw8AGAfD14ZeA/XY1yxoj2Z3pK3q40\ntJ2+wrcrtfKRdweizctaJO2bOsdvI+IySR+W9CDwMulooAidTB/wXWAx4Lh8dDo5IjasL+rOdTh9\n0wzS9SAHqMO6+U9JVwD3AFOA30bE/TWG3bEOl93hwO8rjyP9X0Q8W1PI/SLpTKAHeJuk8cAoYC5m\ng+0K9D19FLxdqZtf0mJmZlYYnzY3MzMrjJO3mZlZYZy8zczMCuPkbWZmVhgnbzMzs8I4eZuZmRXG\nydtmWZK+J2mrAQ67r6RPDXZMuexLJS3Uj/53kPR/uXmUpK/3c3zV4XeStGo/hx/wfOyj3JNbfTpV\n0haSLhns8VXKXyV/3vR2SSsMYPivSppnZsRm1i1+ztuKI2mOiJhadxwDIWkU8GJEHNVh/8MiYkrl\n98nApRFx/syKsVM5lksi4oKm9lsA34iIHWfSeA8EhkXEDwc4/Djgvf15kUvzcjCrm4+8rSvy6y3H\nSDpd0v2Szmkc/Uj6rqSbJd0j6fjKMG8e2UkaJ+lISbcBe+T/SFpb0lRJ78i/H5Q0T/UIV9JXJP1D\n0l35jU9Imk/SiZJuykdwO7SIeSlJ10q6I8e2aSWWxSrTdLKkB/K0bS3phvx7/dz/XpKOaVH+5yXd\nko8iz63Mj5Ml/VrSjcCPGsNLeh+wI/DjHNOKkm6vlPeu6u9e5uOheZrvlrRym2V1naTb8t/GlW7H\n5mm+Elii0n7b3P42YJdK+0UlXZjH9XdJ78ntR+X5f01eZl+uLJdL8zy5R9LHm2LbDvga8D+Srs7t\n9sj1544835TbH5fn7715p4k8nmWAayrDv1gpf9e8U1JdDjfl5dCyzkhavTL+u5Q+jmI2Uzl5Wzet\nAhwbEasDLwL75fbHRMRGEbEWMJ+k7dsM/3RErB8RpwFzS1qA9BWiW4HNld6TPDF/gKPqQGCdiFgH\n+GJu923g6ojYmPS1rZ9KmrdpuJHA5RGxHulDHnfl9tXTVSsBP4mIVUgfw/hkRGwGHJDHQYthGs6P\niA0jYl3gn8DnKt2WjYj3RcQ3G8NHxI2kd10fEBHrRcR/gOeVPuwA6dWZJ7UYT7MnI+K9wPE5zmYT\ngQ9ExPqkd4kfA5B3AN4dEasBewGb5PZzkz4ssX0eZqlKWd8D7oiItfP8OK3SbRXgg6R3rY+SNAzY\nFngsItbN9eHyamAR8ecc988jYmulSwifADbJy2kqsEfu/Vv5VZtrAz2S3hMRx5A+7NETEVs3im2a\n/urvZSNi47wc2tWZLwJH5/GvT/oQjNlM5eRt3TQ+Im7Kzafz1uf/ts5HM/cAWwJrtBn+D5Xmv+fh\n3w/8ENgC2By4vsVwdwNnStqD
9G5vgA8BB0m6ExhNet/y8k3D3QrsLekQYK2IeDm3r35kYVzlPeH/\nAK7OzfcCI9pMR8Na+Qj3HtKOQnW6z+1j2IYTc4xzkJLYmR0Mc2H+f3ubGOcCfpfjOhdYLbffHDgL\nICIm8Na0rgr8J+9MQFq2DZuRE3ZEXEP6rvgCudufIuKNiHiGtMOwJGm+fVDSEZI2i4gX6d3WpM+d\n3pqX5VbAirnb7vlMxJ3A6vkPev8QSLPqcmhXZ24Evi3pAOCdEfFah2WbDZiTt9Up8lHbr4Bd8pHW\n74B2NxO9XGm+npRMlo+Ii0hHV5vSOnlvDxzLWxv5YaSN9675CG/diFghIh6YJriI60k7B4+RPnzR\n6ga46oZ6auX3VPr+8M/JpK+XrQUcxrTT/XLrQaZzPvBh4CPAbR1+UrER45Q2Mf4v8ESOa31SkmpF\nbZo71TzvhkfEWNJyuhc4XNJ3+ihDwCn5TMS6EbFaRBwm6Z3AN4At81H/ZbSvV1XN/TQvh+nqTESc\nBewA/Be4TFJPB+MxmyFO3tZNy0vaKDePBG4gbSwDeCYfkX2sw7KuBz4FjM2/nyUlsRtajTcirgUO\nAhYC5geuAL7S6EHSOs0D5dPwT0bEiaSdivValD2QpNWwAPCEpDl561RvX14kTQMA+SjvCuDXpJ2B\nwbAwMCE3fxoYlpuvAz4haQ5JS5POkkA65T9Cb935/clKWY3lRE5qT0fES+1GnMt9NSLOBH5C63le\ndTXwMUlvz8MvmpfbQsBLwItKnwndrjLMJCrzkLQMVslnLz7ay7ha1hlJK0TEuHxK/iJgrTbDmw0a\nJ2/rpgeA/SXdDywC/DoiXgBOIJ1y/jNwS6X/aNNMRDycG6/N/28gfev4hWp/koYDp0u6m3Sa+BcR\nMQn4PjBnvinqXtKRb7Me4G5JdwC7AUf3EVd/H904hDS91zPt97V7K+ds4ABN+5jUGaSj6CvbDNPf\nGI8DPpNPD69MPvqMiAuBB0nL6vekSxeNHYh9SUedt5FOgTccCrw3z/8fkr9N3UuMawK35HEfQvrc\nZ1sRMQb4DnBlHseVwFIRcQ/pHoUxpNP41Z26E4DLGzesAQcDf8r9PN4ipobDeavO3MdbdWY3Sffl\nmNcATu0tZrPB4EfFrCskjSA94rRm3bHMbiR9A1goIkbVHYuZdUdf1+TMBpP3FAeZpAtIN2gN+ktY\nzGzW5SNvMzOzwviat5mZWWGcvM3MzArj5G1mZlYYJ28zM7PCOHmbmZkVxsnbzMysMP8Pu7hyak22\nAwoAAAAASUVORK5CYII=\n",
"text/plain": [
"<matplotlib.figure.Figure at 0x7fd32a618610>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Pull every ((id_1, id_2), score) record to the driver -- may need subsampling for larger inputs.\n",
"vals_css = sims.collect()\n",
"temp2 = [i[1] for i in vals_css]  # scores only, kept as a plain local list\n",
"n, bins, patches = plt.hist(temp2, bins=200, normed=1, facecolor='green', alpha=0.75)\n",
"plt.xlabel('pairwise similarity in addons features')\n",
"plt.ylabel('Frequency (normalized)')\n",
"plt.title('distribution in pairwise similarity in addons sense for 5% sample of longitudinal')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### It may be interesting to normalize these values by the number of add-ons in the comparison!"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# OK, let's try to relate the pairwise addon similarities to something else"
]
},
{
"cell_type": "code",
"execution_count": 126,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"[Row(client_id=u'fffed907-8cc1-42b5-a5af-c306493f7751', geo_country=u'US', os=u'Windows_NT', sys_mem=8078, virt_mem=None, theme=u'{972ce4c6-7e08-4474-a285-3208198ce6fd}', flash=None, block_list_bool=True, e10_bool=False, telemetry_bool=False, default_search=u'Yahoo', loc=u'en-US', channel=u'release'),\n",
" Row(client_id=u'fff0dbc6-7dc9-4aba-b692-1050c9bb7f39', geo_country=u'CO', os=u'Windows_NT', sys_mem=1644, virt_mem=None, theme=u'{972ce4c6-7e08-4474-a285-3208198ce6fd}', flash=None, block_list_bool=True, e10_bool=False, telemetry_bool=False, default_search=u'Google', loc=u'es-ES', channel=u'release'),\n",
" Row(client_id=u'ffe4b75f-874b-429f-b3cf-cc4dc57eefb6', geo_country=u'CN', os=u'Windows_NT', sys_mem=3979, virt_mem=None, theme=u'{972ce4c6-7e08-4474-a285-3208198ce6fd}', flash=None, block_list_bool=True, e10_bool=False, telemetry_bool=False, default_search=u'\\u767e\\u5ea6', loc=u'zh-CN', channel=u'release')]"
]
},
"execution_count": 126,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Show three sample rows of freshPings_cat_DF.\n",
"# NOTE(review): the saved output of this cell contains raw client_id values; consider clearing it before sharing.\n",
"freshPings_cat_DF.take(3)\n",
"#subset_cat_rdd.take(10)"
]
},
{
"cell_type": "code",
"execution_count": 129,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"<pyspark.broadcast.Broadcast at 0x7fd32908d190>"
]
},
"execution_count": 129,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Tokenize every row of freshPings_cat_DF, keep only entries accepted by exclude_empty_entities, and cache.\n",
"tokenized_catVars_corpus = freshPings_cat_DF.rdd.map(lambda p: tokenize_catVar(p)).filter(lambda p: exclude_empty_entities(p)).cache()\n",
"#cnt = tokenized_catVars_corpus.count()\n",
"corpusCatIDFs = population_frequency(tokenized_catVars_corpus) # idfs of the keys in the pair rdd\n",
"# NOTE(review): despite its name, corpusCatIDFsBroadcast is the plain dict returned by collectAsMap();\n",
"# the Broadcast handle created by sc.broadcast below is not assigned to anything, so catVarRarity\n",
"# reads the dict directly through .get().\n",
"corpusCatIDFsBroadcast = corpusCatIDFs.collectAsMap() \n",
"sc.broadcast(corpusCatIDFsBroadcast)#broadcast the idfs (this should be small)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Because these are categorical variables with a (small) finite set of possible values, we can use them almost exactly the same way as we did with addon names. This way we can reuse a lot of the code."
]
},
{
"cell_type": "code",
"execution_count": 130,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[(u'ffa13c6b-e7c9-4be4-9cc7-d23a52a2a231', ['loc_en-US', 'sys_mem_8083', 'e10_bool_False', 'geo_country_BD', 'block_list_bool_True', 'flash_None', 'default_search_Search for Firefox Search Engine', 'telemetry_bool_True', 'theme_{972ce4c6-7e08-4474-a285-3208198ce6fd}', 'virt_mem_None', 'os_Windows_NT', 'channel_beta'])]\n"
]
}
],
"source": [
"# Tokenize the rows of subset_cat_rdd into (id, ['field_value', ...]) records and print one of them.\n",
"tokenized_catVars = subset_cat_rdd.map(lambda p: tokenize_catVar(p))\n",
"print tokenized_catVars.take(1)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Then perform pairwise similarity comparisons and keep track of SS and SD comparisons in a list of id pairs"
]
},
{
"cell_type": "code",
"execution_count": 132,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[((u'9a6e9a40-ea84-4eee-baa7-494d3f814805', u'aa6e892a-4970-4b15-9dd8-816db3dc5617'), 0.0)]\n",
"[((u'58f72141-c246-4559-b3fb-3aaf981838d3', u'9fe5cd59-7656-401a-b349-9a894165e43c'), 0.0)]\n"
]
}
],
"source": [
"# Pair the tokenized categorical records (id, token list) rather than the raw Rows of\n",
"# subset_cat_rdd: with raw Rows, element 1 of each record is a single column value, not a token list.\n",
"crossSmall3 = generate_nonCartesian_pairWiseComparison_RDD(tokenized_catVars)\n",
"\n",
"def computeCatVarCosSim(pair_record):\n",
"    # Same output shape as computeCosSim, ((id_1, id_2), similarity), but the tokens are\n",
"    # weighted with catVarRarity (categorical corpus weights) instead of the add-on weighting.\n",
"    (id_1, tokens_1), (id_2, tokens_2) = pair_record\n",
"    return ((id_1, id_2), cosSim(catVarRarity(tokens_1), catVarRarity(tokens_2)))\n",
"\n",
"sims2 = crossSmall3.map(computeCatVarCosSim).cache()\n",
"print sims.take(1)\n",
"print sims2.take(1)\n"
]
},
{
"cell_type": "code",
"execution_count": 133,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# number of id pairs that appear as keys in both sims and sims2\n",
"c = sims.keys().intersection(sims2.keys()).count()"
]
},
{
"cell_type": "code",
"execution_count": 134,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"59800\n"
]
}
],
"source": [
"# c is the count of id pairs shared by sims and sims2\n",
"print c "
]
},
{
"cell_type": "code",
"execution_count": 135,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 540 ms, sys: 12 ms, total: 552 ms\n",
"Wall time: 700 ms\n"
]
},
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAgoAAAEZCAYAAAD2aw39AAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAIABJREFUeJzt3XecHWXZ//HPNwlIrwqRFooCgRCUHyWKwgIqIFJERQGR\noqKC5XmwAZbELuojFsRHyhMpAgKKgKJEylI0oQaIBBEEkhBglSIEREjI9fvjvk8yOZzZPdvO7O75\nvl+vfe30ueY+c2auc889M4oIzMzMzBoZVXUAZmZmNnQ5UTAzM7NSThTMzMyslBMFMzMzK+VEwczM\nzEo5UTAzM7NSvU4UJE2V9NXc/SZJ9wxUMJKukHRY7j5c0g0DuOxDJP1hoJZXWO6AlkEV65W0QNLG\nA7GsJtb1U0lf6OO8J0g6LXePk7RYUp+S3QEuvw0lPSNJA7G8oaL4XS8Zv1jSpq2MaSjL5fWkpBlV\nxzIcDNb+I+mdkubm7+S2rVpv3ToG9JhQf7wrniv7udxdJc3rabp+1ShExI0RMb6JYCZLOruJ5b09\nIs4pDupLXI1OIhFxXkTs1ZfldafZMhjK642IVSPioYFYVhPr+lhEfKOP834rIo4uDupHHMuUn6QH\nJe3ex2XNi4jVov0eStJu21tK0puAPYD1ImLSAC1zcU7iF+STzmmFcXtIekDSI5LeWxi+uqTbJK08\nEDEMssHaf74LHJO/k3e2cL1LV1B3TJB0raSj+rvYwvLrz5UDstwyYwZoRf0mSQN4oBVp40fUL7xm\nSRodES9VHcdQ5fIZMG35/SqxMfBQRPyntzN2sz8GMDEiHmww7mRgH2A54FpJF+bj57eAb0bEc72N\nowKDtf+MA2ZXsN6RKyK6/QNeD9wGPA1cAJwPfDWP2xWYV5j288DDwDPAPcBuwJ7AC/lvATAzT3st\n8HXgRuA5YNM87Kg8/vA87sfAv0gf/O6FdT1Y1z8ZODt3zwFeyut7BtgpL++GwvRvBG4GngJuAt5Q\nGHct8NW8/meAPwBrlZRPfRk8CHwauDMv+3xg+ZJ5e9rGI/KwZ4D7gaN7WO/n8nqfBz4EXFYYfx/w\ny0L/XNJBCGAxsGnufjtwd17nPOC4wjzvAGbm7boR2Kab/eZkoCvvN3cCW+XhU+v3H+Czedr5wP7A\n3sC9wOPACXWf8Tm5e1z+jEc1W1a5fB4FziqWH3B2Xta/8/yfAX4LHFu3TXcC+zfY1nG5DGux9Hr/\nAY4rlMERhfGr5fj+kT/jL9TtPzeQfkE9Cfwd2Kubz2RT4Opcrv8AzgVWa+a7nsd/FniE9B0/MpfZ\npv2NM392f89l9Xfg4JL4dwBuyfE9CnyvMG4S8CfSvjkT2LWZ7zPwCuCcXCa1Y8GrCtt0Rt7mecDX\nADWI6yjSd25hXv7kPPzDpO/d48BvgFcX5lkMHAP8Dfh7yfYuBjYrGXd/oftR4JXAjsAVTRzT1wYu\nz9v7BHBd3TH8/rwdfwEOaHC8+n6e937gDXn4XOAx4AOF6acCPwWm5eVdC2xUt321/Wd54HukY/ej\nwKnAK0riF/BF4KG8zp8Dq+ZlLCDtl88C93VTrgOx324MXEfaH6cBp7Ds8Wkxqdb+68Ailh5ffkTd\nMaOwn9bOf6Nyefwzl/MxLHu8qz9X9vT96u7YOLfHfaaHHWq5/GF8EhgNvAt4kWUP9HNz9+Z5Z1k3\n928EbBJLD/Bn1y372rzsLXOhjGmw8QsL6z6IdDJdI4/vLlGonURUGH84cH3uXjMX6CF53e/L/WsW\nYrsP2Ix0ILmWlKWXHejnFvofBGYA6wJr5A/o6JJ5e9rGvYGNc/ebSQnV67pZ7+3AejnmTYAn87hX\n57KufVabAk8U5i0e8B8B3pi7Vy+s7/WkE9n2pC/qYXmdyzXYrreRDuir5v4tCvtFfaKwEPhC3v4P\nsfQEthKwFenLNa6bz3hUk2W1EPgmaZ9+RUn5
7Vbofw8wo9C/LelLO6bB9tbH0tv9Z2HettF5O54D\nVs/jzwYuyeUxjpRAHVnYf14gnagEfBSY3833eTNS9fgY0smiE/h+k9/1vUgH8PHAisAv6vabPsWZ\np38aeE3uXxcYXxL/n4FDC/PtmLvXI52M98z9e+T+tXv6PICjgUvzcJH281XyuEtIJ6wVSCfiGcCH\nu/kuX1/o3z3vL9vmsv0Ry56QFwNXkr5jZSfExaSk7BHgYvL3oFAWE/PyH86f6Z8pSSzqlvvNvF2j\n8me9c2Hcu1j6XX0P6YS7bmEbXwQ+kMvqa6QT+4/zNr6VdDJaqfBdfxrYOY//Acv+WCuesE8mJVOr\nAyvnz+QbJfEfRUqwxuX94FcUzi95uZt0s/3F9fb5+5XL+6Rc9jvnbS07Pi05tzUaXz9NXtds0r69\nBnBN2fKaiLPp80hpmfWwQ70ZeLhu2J9onChsRsru9qDuYEp5ojClwbDixtev+yaWHigepOdEofgh\nLPkiA++ncBIofOgfKMRxYmHcxyjJ1OsLOsd1cKH/JODUbg4updvYYPpLgE90s97D66afA7wOeC/w\nM9KBbnNShvmbki/OQ6RfQqvWLetU4Ct1w/4KvLlBnLvlcTtR9wuMlycKz9WmAVbJsWxfmP5WYL9m\nPuMeyuo/FJKakvIr7k+vIP3a2iz3fxc4pWRdjQ4Kvdl/nqvbV7tIvw5HkQ4AWxTGHQ1cU9h//lYY\nt2KOY53uvteF6fcHbsvduzTYF4vf9TMpJDvAa2v7TR/jXAysQzpAPwm8E1ihh3g78z6wdt3wzwFn\n1Q37A3BYT58HqWbkZbVjObb/UDiJk35QXNPNd7mYKJwBfLvQvzLpJLtR4Tu3aw/b+ybSSWg10sl4\nVmEfm5i3azrp+/YJ4CvANnnbrwZ2KVnuV0jfj2aSipnAvoVtvLcwbkLe315ZGPY4S2sqpwLn1ZXB\nImD9QhnUjjvPUji5k2oqHiiJ6Srgo4X+zXPZjqpfbsn8A7HfbpjXuUJh/DkMXKJwNcv+8n9r2fJK\n4iw9DtDNeaTsr6fGjOuRqkKL5jSaMCL+DvwXMAXoknSepLE9LL+n1paN1r1eD/M0Yz1evh1zgPUL\n/Y8Vuv9NOok1q6sX85Zuo6S9JU2X9ISkp0iZ4Su7WdbDdf3XkQ4iu5AOsp1AB2nnuK5kGe8iXfuc\nkxvg1BpmjQM+nVt1P5nj2YAGn0dEXEuqhvsJaV/4X0llZfBE5D2WVH0LqVaBwrAey76JsvpnRCzs\naTmFbXgB+CXw/txy+WDSgaBZvdl/noiIxQ2mfyXpRDG3MK50P42I50m/KFbJd3XUGsHNApC0jqTz\nJT0s6V+kmptaGb2a7r/r67Hs97U4ri9xQvrl/m9SIvsx4FFJl0vagsY+SKqd+qukmyTtk4ePAw6q\n2zd3BorHn7LP4xzSL/sLcrl8W9LovMzlcky1Zf4v3X//ipY5xkRqM/AEy5ZJ/fd1GZEa3C6KiGeA\nT5GqusfncXdFxG4R8QbSZd4jSTUFZ5CSqaMo31+/Q6qenibpfkmfr42Q9AFJMyU9lbd567ptLh7b\nns+xPF43rLivL9lnchk8Sd0xQ9KrSAnjbbXPEPg9qdarkfrj9xzS/rduyfRl+rzf5hiejGXbpPR4\n90AvdPd9a6ThcQD6dB55mZ4ShUdZttAgXVJoKCIuiIg3k75kkH5NQ3mryrLhNY3W/Ujufo60c9UU\nDwo9LfcR0peuftn1B8pWaLiNkpYnVTd+h3TNdE3Sl6e7hjj12309KTF4EykxuJ6UJOxCSaIQEbdF\nxAHAq0jVfxfmUfNIVYFr5b81I2KViPhlyXJOiYjtSZcPtiBd3x4UTZZVT/tEo/Fnk2qf9gCei4ib\nBiDc3nicdFliXGHYOJrYT/NJZtVILa+3yYO/SfpFtHVErEHatloZ9fRdf5T0K6oYR63M+hxnjvWP\nEfE20nf4
XuD0kun+HhGHRMSrSJ/1xZJWJO2bZ9ftm6tGxHebWPeiiPhaRGxNare0L6lqfR6pRmHt\nwjLXiIiJzWwT6RizpDzyXQhrs2xy0NM+WaS6/0Unk66tv0CqUbgtIuYAy0l62QkhIp6LiM9ExGbA\nfsBxknaTtBFwGumOgTXz9+juknU2a8k+k38srMXL94vHScnb1oXPcI2IWL1kmcuUbe5eyLJJTDP6\ns98+CqwlaYXCsA3LJubln3WtsWnZOazR963X+ngeeZmeEoXpwCJJn5A0RtKBpCrRRgFtnne25UlV\nMs+TDkqQPsCN+3BP6bqFdb+H1J7hijzuDuB9edz2wLsL8/0zr3uzkuVeAbxW0vskjc63F40nNfBp\ntXUabOPvSA1zlgcej4jFkvYmXfvvjVqNwooR8QipwctepAPWzPqJJS2n9LyJ1SK1wq41DIJ08P6o\npB3ztCtLenuj27AkbS9pR0ljSPvBf1i6Lwyk2v40EGX1GKk6comImEGK+3/ouTZhwFtS51qGi4Bv\nSFpF0jjgv5uIpcyqpCreBZLWZ9nkrafv+oXAEZLGS1oJ+HJdnBf2Jc5cy7FfXubCHF/DO1IkHVo4\n8T1NOvguJtWM7CvpbZJGSVpB6f7wHmsfJXVImqB0K/WzOYaXIuIxUgO1kyWtqmRTSbv0tMzsfOBI\nSRMlvYKUpM2IiKZ+dUraStK2eXtWITUgfJhUe1Cc7q2kyyO/z4MeAPaQtDXpO/FEg2XvI6l2bFxA\nuhywmHRpYDHweF7vkaTLC92G2sP4t0t6Yz4vfA2Yno9FS+QaxdOBH+TaBSStL6nsO3w+8N+SNs5l\n8w3ggrpauR71Z7+NiLmky6JT8nHzDaQks6hYNl0Uji+5FmY+qcZylNKtk8Xz1YXAJ3M5rElqZNoX\nA3Fs7D5RyFW1B5KqtZ4gNW75VcnkrwC+TTpJP0L6RXpCHncRqdCekHRrbfGNVlnXP4N0LfRx0k72\nroh4Ko/7EvAaUlXWZFLjqlrcz5N2nj/lqqxlkpuIeJLUgv8zedmfAfYpLLs3mX5P29CTm3j5Nv4r\nIp4lNSy7KFfFvY/0C7/p9UbEfaQDwfW5fwGpyvHGQnV//byHAQ8qVU0fTWrwSUTcRmq7cEqO52+k\na2ONrEb64j9Juvb/OOkafzPqt6O78owcW2/LqpFvA1/K+8txheFnkw6W5/Ywf1l59kVx/k+Qfm09\nQPocz42IqU3OW+8rwP8jNZi9nMJ3uafvekT8gdQY7RrSZ3913bI/2cc4R5Hu+JhP2k92IV2GaGQv\n4G5Jz5B+Rb83Il6IiIdJ7S1OJB1/5pC+07XjW3dlMpb0i+tp0q/na1n6WX+AdJCdTdqXL2LZX33l\nGxdxNekY9eu8bZuQ9sslk/SwiHVJl76eJrVU3xB4RxRuo8wn35NIZV/zSdIlkmnAx+q+5zWvBa6S\ntIDUDuUnEXFdRNxDSopnkBLnrUntN7rd1B76zyNdjn6C1FD0/SXT1u62mJGPPdNIbQ8a+T/Syfx6\n0vHs3yxbBr2pPezrfgtwKKkW6nHSXTUXkNo8NJr2h8B7cvX/D/Kwo0ntax4n/VD9U2H600mXxO4k\nJST1592mtnGAjo1LGpFZCUlnkpKKrmK1o6RPkG5ZWQT8LiKOz8NPIF0fXAR8KiKmdbPsw4EPRkSz\nv1KsxZSefvZhf0ZmvSNpKukW5C/3OPEIIOkC4J6I+ErVsQw0v+uhZ1NJz4JYQlIHqZppm3z993t5\n+HjSLY7jSQ1GTu3D5RYbInJ1+DGkO0bMzJbIl1g3zZel9iK19/hN1XENBicKPYiIG0kPFyn6GOnW\np0V5mlqr3/1J18oWRXok8n2UtOmwoS1fH/0HqVHR+RWHYzYcjfTq6rGkO8kWkC7LfTQaPzJ62POl\nhybkRi6X1y49SJpJus6zF6mx3mci4jZJPyY11jkvT3cG6X7tX1cUupmZWb
8MmXc9DDNjSE9xnCRp\nB1IjJ79Fz8zMRhwnCn0zj9SamYi4RdJLktYmtW4u3nu+ASX35EpyVY6ZWR9EhNt+tZDbKDRHLHtP\n7G9Iz3JH0uaklz49AVwGvFfS8pI2Id2+eXPZQmfMmNHjY3bb4W/y5MmVxzBU/lwWLguXRfd/1npO\nFHog6TzSeyA2lzQ3P4Tk/4BNlR6Nex7pfmsiYjbpQRmzSQ91Oia62bOP/u+jBzt8MzOzfvGlhx5E\nxCElow4rmf5bpHfC92hx7x4kZmZm1nKuUbDKdXR0VB3CkOGyWMplsZTLwqrk2yMrIikmTJrArOmz\nqg7FzGzYkES4MWNLuUbBzMzMSjlRMDMzs1JOFMzMzKyUEwUzMzMr5UTBzMzMSjlRMDMzs1JOFMzM\nzKyUEwUzMzMr5UTBzMzMSjlRMDMzs1JOFMzMzKyUEwUzMzMr5UTBzMzMSjlRMDMzs1JOFMzMzKyU\nEwUzMzMr5UTBzMzMSjlR6IGkMyV1SbqrwbhPS1osaa3CsBMk3SfpHklva220ZmZmA8uJQs+mAnvW\nD5S0AfBWYE5h2HjgIGA8sDdwqiS1KE4zM7MB50ShBxFxI/BUg1EnA5+tG7Y/cEFELIqIh4D7gB0H\nN0IzM7PB40ShDyTtB8yLiFl1o9YH5hX65+dhZmZmw9KYqgMYbiStCJxIuuxgZmY2ojlR6L3NgI2B\nO3P7gw2A2yXtSKpB2Kgw7QZ5WENd87qYMmUKAB0dHXR0dAxOxGZmw1RnZyednZ1Vh9HWFBFVxzDk\nSdoYuDwitmkw7kFgu4h4StJWwC+AnUiXHP4IvDYaFLKkmDBpArOm11+9MDOzMpKICDcSbyG3UeiB\npPOAPwObS5or6ci6SQIQQETMBi4EZgNXAMc0ShLMzMyGC9coVMQ1CmZmvecahdZzjYKZmZmVcqJg\nZmZmpZwomJmZWSknCmZmZlbKiYKZmZmVcqJgZmZmpZwomJmZWSknCmZmZlbKiYKZmZmVcqJgZmZm\npZwomJmZWSknCmZmZlbKiYKZmZmVcqJgZmZmpZwomJmZWSknCmZmZlbKiYKZmZmVcqJgZmZmpZwo\nmJmZWSknCj2QdKakLkl3FYZ9R9I9ku6Q9CtJqxXGnSDpvjz+bdVEbWZmNjCcKPRsKrBn3bBpwNYR\n8TrgPuAEAElbAQcB44G9gVMlqYWxmpmZDai2SBQkjZL0ekn7SNpd0jrNzhsRNwJP1Q27KiIW594Z\nwAa5ez/ggohYFBEPkZKIHfu/BWZmZtUYU3UAg0nSZsDngbeQTtr/BFYANpf0b+BnwFmFk35fHAWc\nn7vXB6YXxs3Pw8zMzIalEZ0oAF8Hfgp8JCKiOCLXKhwCHAac1ZeFS/oCsDAizu9x4ga65nUxZcoU\nADo6Oujo6OjLYszMRqzOzk46OzurDqOtqe78aQ1IGgdcHhETC8OOAD4M7B4RL+RhxwMRESfl/j8A\nkyPipgbLjAmTJjBr+qxWbIKZ2YggiYhw268WGtE1CpIO7G58RPy62UXlv9py9wI+C+xSSxKyy4Bf\nSDqZdMnhNcDNvQrazMxsCBnRiQKwb/6/DvBG4JrcvxvwZ6DHREHSeUAHsLakucBk4ERgeeCP+aaG\nGRFxTETMlnQhMBtYCBxTf8nDzMxsOGmLSw+SpgGHR8Sjuf/VwM8jov62x1bG5EsPZma95EsPrdcW\nt0cCG9aShKwL2KiqYMzMzIaLkX7poeZqSVey9DbG9wJXVRiPmZnZsNAWiUJEfFzSO4Fd8qDTIuKS\nKmMyMzMbDtoiUchuBxZExFWSVpK0akQsqDooMzOzoawt2ihI+jBwMelJjJBuXfxNdRGZmZkND22R\nKADHAjsDzwBExH2kWybNzMysG+2SKLwQES/WeiSNAUb+faFmZmb91C6JwnWSTgRWlPRW4CLg8opj\nMjMzG/LaJVE4nvTmyFnAR4ArIuIL1Y
ZkZmY29LXLXQ+HAhdExOm1AZLeERG/rTAmMzOzIa9dahR+\nDNwgaXxh2FerCsbMzGy4aJdE4UHgKOBiSe/Jw/yscDMzsx60y6WHiIjbJe0KnC9pJ2B01UGZmZkN\nde1So/AoQEQ8DuxJujVyQqURmZmZDQNtkShExD6F7sUR8dmIaIttNzMz648RfelB0g8i4r8kXU6D\nByxFxH4VhGVmZjZsjOhEATgn//9epVGYmZkNUyM6UYiI2/L/66qOxczMbDga0YmCpFl0806HiJjY\nwnDMzMyGnRGdKADv6O8CJJ2Zl9NVSywkrQn8EhgHPAQcFBFP53EnkJ7ZsAj4VERM628MZmZmVRnR\nLf8jYk53f00uZirplsqi44GrImIL4BrgBABJWwEHAeOBvYFTJfnBTmZmNmyN6EShRtIkSbdIelbS\ni5JekvRMM/NGxI3AU3WD9wfOyt1nAQfk7v1I75RYFBEPAfcBO/Z/C8zMzKrRFokCcApwMOnEvSLw\nIeAn/VjeOhHRBRARjwHr5OHrA/MK083Pw8zMzIalkd5GYYmIuF/S6Ih4CZgqaSb5ksFALL4vM3XN\n62LKlCkAdHR00NHRMUDhmJmNDJ2dnXR2dlYdRltTRJ/OccOKpOuBtwBnAI+RHul8RERs2+T844DL\nC40Z7wE6IqJL0ljg2ogYL+l40nslTsrT/QGYHBE3NVhmTJg0gVnTZw3EJpqZtQVJRITbfrVQu1x6\nOIz0EqiPA88BGwLv6sX8Ytm3TV4GHJG7DwcuLQx/n6TlJW0CvAa4ue9hm5mZVastLj0U7nB4HvhK\nb+aVdB7QAawtaS4wGfg2cJGko4A5pDsdiIjZki4EZgMLgWOiHapszMxsxGqXSw/vAL5Geu7BGFLt\nQETEahXG5EsPZma95EsPrdcWNQrAD4ADgVn+hW9mZta8dmmjMA/4i5MEMzOz3mmXGoXPAVdIug54\noTYwIr5fXUhmZmZDX7skCt8AngVWAJavOBYzM7Nho10ShfUiYkLVQZiZmQ037dJG4QpJb6s6CDMz\ns+GmXRKFjwF/kPS8pGckLWj2pVBmZmbtbMRfesived46IuZWHYuZmdlwM+JrFPItkb+rOg4zM7Ph\naMQnCtntknaoOggzM7PhZsRfesh2Ag6VNIf0UqjaI5wnVhuWmZnZ0NYuicKeVQdgZmY2HLXFpYf8\n9sg1gH3z3xqFN0qamZlZibZIFCR9CvgFsE7+O1fSJ6qNyszMbOhrl0sPHwR2iojnACSdBEwHflxp\nVGZmZkNcW9QokBovvlTofykPMzMzs260S43CVOAmSZfk/gOAMyuMx8zMbFhoi0QhIr6fXzG9cx50\nZETMrDImMzOz4aAtEoXsDuBR8jZL2siPdTYzM+teWyQK+Q6HyUAXS9snBNCvBy5J+m9SQ8nFwCzg\nSGBl4JfAOOAh4KCIeLo/6zEzM6tKuzRm/BSwRURsHRETI2Kb/j6VUdJ6wCeA7fKyxgAHA8cDV0XE\nFsA1wAn9jN3MzKwy7ZIozAMG41f9aGBlSWOAFYH5wP7AWXn8WaSGk2ZmZsNSW1x6AB4AOiX9Dnih\nNjAivt/XBUbEI5L+B5gL/BuYFhFXSVo3IrryNI9JWqefsZuZmVWmXRKFuflv+fzXb5LWINUejCPV\nVlwk6VBS24ei+v4luuZ1MWXKFAA6Ojro6OgYiNDMzEaMzs5OOjs7qw6jrSmi9Dxm3ZD0bmDPiPhw\n7j8MmATsDnRERJekscC1ETG+wfwxYdIEZk2f1dK4zcyGM0lEhB+Y10Ijuo2CpNMlbVMybmVJR+Va\ngL6YC0yStIIkAXsAs4HLgCPyNIcDl/Zx+WZmZpUb6ZcefgJ8KScLfwH+CawAvBZYDfg/0suiei0i\nbpZ0MTATWJj/nwasClwo6ShgDnBQfzfCzMysKm1x6UHSKsD2wKuB54F7IuLeimPypQczs17ypYfW\nG+
k1CgBExLNAZ9VxmJmZDTcjuo2CmZmZ9Y8TBTMzMyvVFolC2Z0PZmZm1r22SBSAUyXdLOkYSatX\nHYyZmdlw0RaJQkS8GTgU2BC4TdJ5kt5acVhmZmZDXlskCgARcR/wReDzwK7AjyT9VdKB1UZmZmY2\ndLVFoiBpoqSTgXtIj1jeNz9WeXfg5EqDMzMzG8La4jkKwI+BM4ATI+L52sD8BsgvVheWmZnZ0NYu\nicI+wPMR8RKApFHAChHx74g4p9rQzMzMhq62uPQAXAWsWOhfKQ8zMzOzbrRLorBCfowzsOSRzitV\nGI+Zmdmw0C6JwnOStqv1SPp/pJdDmZmZWTfapY3CfwEXSXoEEDAWeG+1IZmZmQ19bZEoRMQtkrYE\ntsiD7o2IhVXGZGZmNhy0RaKQ7QBsTNrm7fI7zc+uNiQzM7OhrS0SBUnnAJsBdwAv5cEBOFEwMzPr\nRlskCsD2wFYREVUHYmZmNpy0y10PfyE1YDQzM7NeaJcahVcCsyXdDLxQGxgR+/VnofmV1WcAE4DF\nwFHA34BfAuOAh4CDIuLp/qzHzMysKu2SKEwZpOX+ELgiIt4jaQywMnAicFVEfEfS54ETgOMHaf1m\nZmaDqi0uPUTEdaRf98vl7luA2/uzTEmrAW+OiKl5HYtyzcH+wFl5srOAA/qzHjMzsyq1RaIg6cPA\nxcDP8qD1gd/0c7GbAI9LmirpdkmnSVoJWDciugAi4jFgnX6ux8zMrDLtcunhWGBH4CaAiLhPUn9P\n4GOA7YBjI+JWSSeTLjHU31lReqdF17wupkyZAkBHRwcdHR39DMnMbGTp7Oyks7Oz6jDamtrhjkFJ\nN0XETpJmRsTrc3uC2yNiYj+WuS4wPSI2zf1vIiUKmwEdEdElaSxwbUSMbzB/TJg0gVnTZ/U1BDOz\ntpMflqeq42gnbXHpAbhO0onAipLeClwEXN6fBebLC/MkbZ4H7QHcDVwGHJGHHQ5c2p/1mJmZVald\nLj0cD3wQmAV8BLiCdFtjf30S+IWk5YAHgCOB0cCFko4C5gAHDcB6zMzMKtEWlx6GIl96MDPrPV96\naL22qFGQ9CANGhXW2heYmZlZY22RKJDe9VCzAvAeYK2KYjEzMxs22qIxY0Q8UfibHxE/APapOi4z\nM7Ohri1qFCRtV+gdRaphaIttNzMz6492OVn+T6F7EfllTdWEYmZmNny0RaIQEbtVHYOZmdlw1BaJ\ngqTjuhsfEd9vVSxmZmbDSVskCqQ2CTuQnpoIsC9wM3BfZRGZmZkNA+2SKGwAbBcRCwAkTQF+FxHv\nrzQqMzOVsbclAAAQNUlEQVSzIa4tbo8E1gVeLPS/mIeZmZlZN9qlRuFs4GZJl+T+A4CzKozHzMxs\nWGiLRCEiviHp98Cb86AjI2JmlTGZmZkNB+1y6QFgJeCZiPgh8LCkTaoOyMzMbKhri0RB0mTg88AJ\nedBywLnVRWRmZjY8tEWiALwT2A94DiAiHgFWrTQiMzOzYaBdEoUXIyLIr5qWtHLF8ZiZmQ0L7ZIo\nXCjpZ8Aakj4MXAWcXnFMZmZmQ1673PXwPUlvBZ4BtgC+HBF/rDgsMzOzIW/EJwqSRgNX5RdDOTkw\nMzPrhRF/6SEiXgIWS1p9oJctaZSk2yVdlvvXlDRN0r2SrhyMdZqZmbXSiE8UsmeBWZLOlPSj2t8A\nLPdTwOxC//Gk2ostgGtYejummZnZsDTiLz1kv85/A0bSBsDbgW8AtddY7w/smrvPAjpJyYOZmdmw\nNKITBUkbRcTciBiM9zqcDHwWKF5eWDciugAi4jFJ6wzCes3MzFpmRCcKwG+A7QAk/Soi3jUQC5W0\nD9AVEXdI6uhm0uhuOV3zupgyZQoAHR0ddHR0tygzs/bT2dlJZ2dn1WG0NaXnEI1MkmZGxOvruwdg\nud8E3g8sAlYkPeXxEmB7oCMiuiSNBa6NiPEly4gJkyYwa/qsgQjJ
zKwtSCIiVHUc7WSkN2aMku7+\nLTTixIjYKCI2Bd4HXBMRhwGXA0fkyQ4HLh2odZqZmVVhpF962FbSM4CAFXM3uT8iYrUBXt+3SU+B\nPAqYAxw0wMs3MzNrqRGdKETE6Bas4zrgutz9JPCWwV6nmZlZq4z0Sw9mZmbWD04UzMzMrJQTBTMz\nMyvlRMHMzMxKOVEwMzOzUk4UzMzMrJQTBTNbxp4H7Mn4Hcaz5wF7Vh2KmQ0BThTMbBlz589l7MfH\nMnf+3KpDMbMhwImCmZmZlXKiYGZmZqWcKJiZmVkpJwpmZmZWyomCmZmZlXKiYGZmZqWcKJiZmVkp\nJwpmZmZWyomCmZmZlXKiYGZmZqWcKPSRpA0kXSPpbkmzJH0yD19T0jRJ90q6UtLqVcdqZmbWV04U\n+m4RcFxEbA28AThW0pbA8cBVEbEFcA1wQoUxmpmZ9YsThT6KiMci4o7c/SxwD7ABsD9wVp7sLOCA\naiI0MzPrPycKA0DSxsDrgBnAuhHRBSmZANapLjIzM7P+caLQT5JWAS4GPpVrFqJukvp+MzOzYWNM\n1QEMZ5LGkJKEcyLi0jy4S9K6EdElaSzwj7L5u+Z1MWXKFAA6Ojro6OgY5IjNzIaXzs5OOjs7qw6j\nrSnCP3j7StLZwOMRcVxh2EnAkxFxkqTPA2tGxPEN5o0JkyYwa/qsFkZs1rPxO4xn7MfH8tgpj3HP\nLfdUHY7ZMiQREao6jnbiGoU+krQzcCgwS9JM0iWGE4GTgAslHQXMAQ6qLkozM7P+caLQRxHxJ2B0\nyei3tDIWMzOzweLGjGZmZlbKiYKZmZmVcqJgZmZmpZwomJmZWSknCmZmZlbKiYKZmZmVcqJgZmZm\npZwomJmZWSknCmZmZlbKiYKZmZmV8iOczcxsyNvzgD2ZO39u1WG0JdcomJnZkDd3/lzGfnxs1WG0\nJScKZmZmVsqJgpmZmZVyomBmZmalnCiYmZlZKScKZmZmVsqJgpmZmZVyomBmZmalnCgMAkl7Sfqr\npL9J+nzV8ZiZmfWVE4UBJmkUcAqwJ7A1cLCkLauNamjr7OysOoQhw2WxlMtiKZeFVcmJwsDbEbgv\nIuZExELgAmD/imMa0nwQXMplsZTLYimXhVXJicLAWx+YV+h/OA8zMzMbdpwoVGiF5VeoOgQzM7Nu\nKSKqjmFEkTQJmBIRe+X+44GIiJPqpnPBm5n1QUSo6hjaiROFASZpNHAvsAfwKHAzcHBE3FNpYGZm\nZn0wpuoARpqIeEnSx4FppEs7ZzpJMDOz4co1CmZmZlbKjRkHUTMPXpL0I0n3SbpD0utaHWOr9FQW\nkg6RdGf+u1HSNlXE2QrNPpBL0g6SFko6sJXxtVKT35EOSTMl/UXSta2OsVWa+I6sJumyfKyYJemI\nCsJsCUlnSuqSdFc307TFsXNIiAj/DcIfKQm7HxgHLAfcAWxZN83ewO9y907AjKrjrrAsJgGr5+69\n2rksCtNdDfwWOLDquCvcL1YH7gbWz/2vrDruCsviBOBbtXIAngDGVB37IJXHm4DXAXeVjG+LY+dQ\n+XONwuBp5sFL+wNnA0TETcDqktZtbZgt0WNZRMSMiHg6985g5D57otkHcn0CuBj4RyuDa7FmyuIQ\n4FcRMR8gIh5vcYyt0kxZBLBq7l4VeCIiFrUwxpaJiBuBp7qZpF2OnUOCE4XB08yDl+qnmd9gmpGg\ntw+h+hDw+0GNqDo9loWk9YADIuKnwEi+DayZ/WJzYC1J10q6RdJhLYuutZopi1OArSQ9AtwJfKpF\nsQ1F7XLsHBJ814MNKZJ2A44kVT22qx8AxWvUIzlZ6MkYYDtgd2BlYLqk6RFxf7VhVWJPYGZE7C5p\nM+CPkiZGxLNVB2YjmxOFwTMf2KjQv0EeVj/Nhj1MMxI0UxZImgicBuwVEd1VOw5nzZTF9sAFkkS6\nFr23pIURcVmLYmyVZsriYeDx
iPgP8B9J1wPbkq7njyTNlMWRwLcAIuLvkh4EtgRubUmEQ0u7HDuH\nBF96GDy3AK+RNE7S8sD7gPoD/WXAB2DJEx3/FRFdrQ2zJXosC0kbAb8CDouIv1cQY6v0WBYRsWn+\n24TUTuGYEZgkQHPfkUuBN0kaLWklUsO1kfhckmbKYg7wFoB8PX5z4IGWRtlaorw2rV2OnUOCaxQG\nSZQ8eEnSR9LoOC0irpD0dkn3A8+RfjGMOM2UBfAlYC3g1PxLemFE7Fhd1IOjybJYZpaWB9kiTX5H\n/irpSuAu4CXgtIiYXWHYg6LJ/eLrwM8Ltwx+LiKerCjkQSXpPKADWFvSXGAysDxtduwcKvzAJTMz\nMyvlSw9mZmZWyomCmZmZlXKiYGZmZqWcKJiZmVkpJwpmZmZWyomCmZmZlXKiYEOepK9I2r2P835E\n0vsHOqa87N9KWq0X0+8r6XO5e7Kk43q5vuL8+0vaspfz97kcqyBpV0mXl4x7UNJag7ju8/Pri3v9\nPoUc9xsGIy6zKviBSzbkRcTkRsMljYqIxT3M+7PBiQoi4h29nP5yoOGJryeSRtfNfwDpFdR/7cX6\nG5bjEFf2oJdBewCMpLHA9hHx2j4uogN4Fpjei3WOjoiX+rg+s0HlGgVrqfyI2nsknStptqQLJa2Q\nx31J0k2S7pL0v4V5pko6MHc/KOnbkm4FDs3/kbStpMWSNsj990taofjLXdInJd2dfymel4etJOlM\nSTMk3SZp3wYxj5V0naTbc2w7F2JZq7BNUyXdm7dtD0k35v7t8/SHS/pxg+V/SNLNkmZKuqhQHlMl\n/VTSdOCk2vz51+p+wHdyTJtKuq2wvNcU+7spxyl5m++UtHmD6Q+X9CtJv8/bcVJh3MG5LO6S9O3C\n8AWSvp7L+M+SXlWyH5yat3mWpMmF4XvlsrwVOLAwfC1JV+bpT6fwaF9Jx+Xhd9VqAPJnMlvSaZL+\nIukPkl6Rx71sP6hzJbBeLtudc/n+XuntldfVykrSOwr7zTRJr5I0Dvgo8F+F+ZeUe62M8v9dJV0v\n6VLg7jzsUKXvwO35s5ekUXkZd+XPqp3fGmlViAj/+a9lf8A4YDEwKfefCRyXu9coTHc2sE/ungoc\nmLsfBD5TmG4WsApwLHATcDDp5Tp/yuMnF5Y/H1gud6+W/38DOCR3rw7cC6xYF/NxwAm5W8DKufsB\n0mOnxwEvAlvl4bcCZ+Tu/YBLcvfhwI8axLVmYV1fA44tbPdlhXHF+ZeUSe6/GphY2KZjG5R9fTke\nk7s/BpzeYPrDSS9fWgV4BfAQ6VW+rya9d2At0o+Nq4H98jyLgbfn7pOAE0v2gzXy/1HAtcCEvI65\nwKZ53C9r2w/8EPhi7n476XHOa5HeLHknsALp7ZJ/Ib00qvaZbFNYVu1zftl+0GAfvavQfxWwWe7e\nEbi6tr8Upvkg8N36z7bks3om/98VWABslPu3JL3DYHTu/wnw/ryN0wrzvyxm//lvMP9co2BVmBsR\nM3L3uSx9pfQe+RfaXcBuwNYl8/+y0P3nPP8uwDdJB983Azc0mO9O4DxJh5JONABvA46XNBPoJD1P\nfqO6+W4BjpT0ZdLJ+Lk8vPjCmgdj6TsI7iadPCElMuNKtqNmYv5leRdwCMtu90U9zFtzZo5xFPBe\noNEv5XqX5P+3dRPj1RHxbES8QNquccAOwLUR8WSkSz+/IJU/wIsRcUVhuRuXLPd9udZjJrBV/tsS\neCAiai86Orcw/S61/rz82ttF30RKxP6TP5dfkz5/SJ/JrAaxNNoPGpK0MvBG4KK8j/wMWDeP3jDX\nctwFfIby/bU7N0fE3Ny9BykpuCWva3dgU1JCuomkH0rak5RcmLWMEwUbCiJXC/+E9MtrInAG6Vdi\nI88Vum8gnRg2iohLSb8md6ZxorAPcApLD8ajSSf7d0XE6/PfJhFx7zLBRdxAOlHNJ72Up1HjyB
cK\n3YsL/YvpuS3QVNKv+4nAV1l2u59rPMvL/Ir0S/sdwK3R3Gu6azG+1E2M9dtVm67srX4LC90vAWNy\n1fnMXJ0+RdLGwKeB3SJiW+AKlm5z2XLrNTNdMfbiNtbvB90dB0cBT0XEdoV9ZEIe92NSDc9E0uWG\nsv11UV4OkkRKRmuKn6+AswrrGh8RX42If5H2607gI6TvhlnLOFGwKmwkaafcfQhwI+kgG8ATklYB\n3t3ksm4gVc/el/ufJJ0wb2y03oi4DjgeWI1UVX0l8MnaBJJeVz+T0iuw/xERZ5IO0ts1WHazJ7hG\nVgEek7QccGiT8ywgbQMA+Rf/lcBPSYnHYLoZ2CW3GxhNutzTWTZxRCzOJ77tImIKKe5ngQVKr0ve\nO0/6V2CcpE1y/8GFxVxPLhtJewNr5OE3AAcotUdZGXgnS5PEss+kfj9YpcE0yrEvAB6UtGR/lDQx\nd64GPJK7Dy/Mu8xnQ7pks33u3h9YriSuq4F319p1SFpT0kaS1iZdjriE9JbV15fMbzYonChYFe4F\njpU0m3TA/2lEPA2cTqre/j3pZFQTJd1ExJzceV3+fyPp3fRPF6eTNAY4V9KdpGroH0bEM6Q2Acvl\nhmKzSL/o63UAd0q6HTgI+EEPcfW2Rf6XSdt7A3BPk8u5APhsbkhXO7H+gvTLeVrJPP2Jcck8EfEY\n6STbSbp0cGtE/LbZ5UbEXcAdpG09l5zU5WTnI8AVuTFjV2G2r5CSk1mkOz7m5nlmAj8nXR6aTnoN\n9Z1lsXSzHzTc1uxQ4IO58eNfSO1OajFdLOkW4J+F6S8H3llrzEjar3fNlxMmUVJLFBH3AF8EpuX4\npgFjSe1COvP855DK3qxl/Jppa6ncKvy3EbFN1bGMNJI+TWroNhxvgzSzIcrPUbAqODsdYJJ+TWr4\nNmweqGRmw4NrFMzMzKyU2yiYmZlZKScKZmZmVsqJgpmZmZVyomBmZmalnCiYmZlZKScKZmZmVur/\nA+Ok9qzWEqCZAAAAAElFTkSuQmCC\n",
"text/plain": [
"<matplotlib.figure.Figure at 0x7fd32ab8f550>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"%%time\n",
"# nooooo dont let plotly ruin this for us!\n",
"vals_css = sims2.collect() # maybe we need to subsample!\n",
"temp3 = []\n",
"for i in vals_css:\n",
" temp3.append(i[1]) #store it on the client machine (locally) as a list\n",
"n, bins, patches = plt.hist(temp3, 200, normed=1, facecolor='green', alpha=0.75)\n",
"plt.xlabel('pairwise similarity in non-addons features')\n",
"plt.ylabel('Frequency (normalized)')\n",
"plt.title('distribution in pairwise similarity in non-addons sense for 5% sample of longitudinal')"
]
},
{
"cell_type": "code",
"execution_count": 136,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"(-0.049752280687800708, 0.62480983586326322)\n"
]
}
],
"source": [
"from scipy.stats.stats import pearsonr\n",
"p = pearsonr(temp2[1:100], temp3[1:100]) # must be the same shape!!\n",
"print p"
]
},
{
"cell_type": "code",
"execution_count": 139,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"[((u'2da6168f-b374-4e53-8530-b142eb33d9f8',\n",
" u'dc61df84-abe3-412c-9071-c9d523df9649'),\n",
" 0.0,\n",
" 0.0)]"
]
},
"execution_count": 139,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"sims_combined_rdd = sims.join(sims2).map(lambda p: ((p[0]), p[1][0], p[1][1]))\n",
"sims_combined_rdd.take(1)"
]
},
{
"cell_type": "code",
"execution_count": 140,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"ADDON_SIM_THRESHOLD = 0.42 #hehe\n",
"\n",
"ss_rdd = sims_combined_rdd.filter(lambda x: x[1] >= ADDON_SIM_THRESHOLD).map(lambda p: p[1])\n",
"ds_rdd = sims_combined_rdd.filter(lambda x: x[1] < ADDON_SIM_THRESHOLD).map(lambda p: p[1])\n",
"\n",
"LR_vals_x = np.arange(0,1,0.0001)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Fit and visualize all 4 distributions -- First up: All normal assumptions"
]
},
{
"cell_type": "code",
"execution_count": 141,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0.925513286593 0.158513615\n",
"0.0022770653204 0.0208416550357\n"
]
}
],
"source": [
"import scipy.stats\n",
"import scipy.optimize\n",
"\n",
"def exponential_func(x, a, b, c):\n",
" return a * np.exp(-b * x) + c\n",
"\n",
"ss_mu = ss_rdd.mean()\n",
"ss_sig = ss_rdd.sampleStdev()\n",
"ds_mu = ds_rdd.mean()\n",
"ds_sig = ds_rdd.sampleStdev()\n",
"\n",
"print ss_mu, ss_sig\n",
"print ds_mu, ds_sig\n",
"# Find density estimates for the given values\n",
"LR_vals_y_norm = scipy.stats.norm(ss_mu, ss_sig).pdf(LR_vals_x)/scipy.stats.norm(ds_mu, ds_sig).pdf(LR_vals_x) #LR right here!\n",
"\n",
"poly_model_norm = np.polyfit(LR_vals_x, LR_vals_y_norm, 3)# fit the 3rd order polynomial, make sure this is ok by exmaining residuals later on!popt, pcov = curve_fit(func, x, y, p0=(1, 1e-6, 1))\n",
"#popt, pcov = scipy.optimize.curve_fit(exponential_func, LR_vals_x, LR_vals_y_norm, method='lm')\n",
"\n",
"#exponential_model_norm = exponential_func(LR_vals_x, *popt)\n",
"\n",
"LR_vals_y_norm_prime = np.polyval(poly_model_norm, LR_vals_x)# sanity check!"
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"[<matplotlib.lines.Line2D at 0x7f6e01c7a9d0>]"
]
},
"execution_count": 28,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAYEAAAEGCAYAAACD7ClEAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAFspJREFUeJzt3X2QXfV93/H3d/furoQeFklmBZKQhLGFbTDFQDDUHnsT\ndxqgNXhcj2O748RM22FS4zLjTJuk4w7q1DMhf7QGx06oXPJApy5OkwmQ2jRJHdYuMWBqQWRiVMA8\nGC0gAZKQVtI+3fvtH/dKurvalVa6V/ug3/s1c+aeh9/5nZ+OVvez33POvYrMRJJUpo65HoAkae4Y\nApJUMENAkgpmCEhSwQwBSSqYISBJBZuXIRARd0fEzojYdhL7/JOIqEXE5Y3l/oh4IiK2Nl4PRcQN\np2/UkrTwxHz8nEBEfBAYAu7JzEtn0H4p8G2gC7glM7dO2r4CeBZYl5nDp2HIkrQgzctKIDMfBvY0\nr4uIt0fEgxHxeER8LyI2NW3+D8DtwMg0XX4CeNAAkKSJ5mUITGML9d/yfw7418DvATQu/6zLzAeP\ns++ngP9++ocoSQtLZa4HMBMRsQT4+8D/iIhorO5qzP9H4Feam0/a91zgEuAvZmOskrSQLIgQoF6x\n7MnMy5tXRsRy4GJgoBEI5wL3R8QNTfcFPgn8WWZWZ3XEkrQAtHw5KCLWRcRfR8TfRcSPI+JfTdPu\nqxHxbEQ8GRGXzaTrxkRm7gdeiIhPNPV3aWbuy8y+zHx7Zl4APAp8dNKN4U/jpSBJmlI77gmMA1/M\nzIuBa4DPR8S7mhtExHXAhZn5TuBm4K7jdRgR3wR+AGyKiJ9FxE3APwX+WSNEngKmetwzabocFBEb\nqN8v+N6p//Ek6czV9kdEI+I+4Hcy87tN6+4CHsrMbzWWnwb6M3NnWw8uSTopbX06KCI2ApcBj03a\ntBZ4uWl5sLFOkjSH2hYCjQ9s/Qlwa2YOtatfSdLp05angyKiQj0A/mtm3j9Fk0Hg/KbldY11U/U1\n/z7CLEnzXGbGiVsdq12VwO8DP8nMO6fZ/gDwywARcTWw93j3AzLTKZPbbrttzscwHybPg+fCc3H8\nqRUtVwIR8QHqT+78OCKeoP6Ezr8FNtTfz3NLZn4nIq6PiOeAA8BNrR5XktS6lkMgM/8G6JxBu1ta\nPZYkqb0W0ncHFae/v3+uhzAveB6O8lwc5bloj3n3VdIRkfNtTJI0n0UEOcc3hiVJC5AhIEkFMwQk\nqWCGgCQVzBCQpIIZApJUMENAkgpmCEhSwQwBSSqYISBJBTMEJKlghoAkFcwQkKSCGQKStIBVq63t\nbwhI0gL2zDOt7W8ISNICZiUgSQWr1Vrb3xCQpAXMSkCSCmYISFLBvBwkSQWzEpCkglkJSFLBrAQk\nqWCGgCQVzMtBklQwKwFJKpiVgCQVzEpAkgpmCEhSwQwBSSqYISBJBTMEJKlg4+Ot7W8ISNICNi8q\ngYi4OyJ2RsS2abZ/OCL2RsTWxvSldhxXkkrXaghU2jMM/gD4HeCe47T5fmbe0KbjSZKYJ5VAZj4M\n7DlBs2jHsSRJR82LEJihayLiyYj4dkS8ZxaPK0lnrPlyOehEfgSsz8yDEXEdcB+wabrGmzdvPjLf\n399Pf3//6R6fJC0YAwMDDAwMAPDYY631FZnZ+oiAiNgA/HlmXjqDti8AV2Tm7im2ZbvGJElnuq98\nBb74xSAzT+mSezsvBwXTXPePiNVN81dRD59jAkCSdHLmxeWgiPgm0A+sioifAbcB3UBm5hbgExHx\nq8AYcAj4pXYcV5JKNy9CIDM/c4LtXwe+3o5jSZKOWkhPB0mS2syvjZCkglkJSFLBDAFJKpghIEkF\nMwQkqWCGgCQVzKeDJKlgVgKSVDBDQJIKZghIUsEMAUkqmCEgSQXz6SBJKpiVgCQVzBCQpIIZApJU\nMENAkgpmCEhSwXw6SJIKZiUgSQUzBCSpYIaA
JBXMEJCkghkCklQwnw6SpIJZCUhSwQwBSSqYISBJ\nBfOegCQVzBCQpIKNjbW2vyEgSQuYlYAkFcxKQJIKZiUgSQWbF5VARNwdETsjYttx2nw1Ip6NiCcj\n4rJ2HFeSSjdfKoE/AH5xuo0RcR1wYWa+E7gZuKtNx5Wkos2LSiAzHwb2HKfJjcA9jbaPAb0Rsbod\nx5akUmXOn0rgRNYCLzctDzbWSZJOUbUKnZ2t9eGNYUlaoMbGoFJprY8Wd5+xQeD8puV1jXVT2rx5\n85H5/v5++vv7T9e4JGnBGRgYYGBggJERqNVa6ysysy2DioiNwJ9n5nun2HY98PnM/EcRcTVwR2Ze\nPU0/2a4xSdKZbPduuPBC2Ls3yMw4lT7aUglExDeBfmBVRPwMuA3oBjIzt2TmdyLi+oh4DjgA3NSO\n40pSycbHoaurtT7aEgKZ+ZkZtLmlHceSJNW1456AN4YlaYFqRyVgCEjSAmUlIEkFsxKQpIJZCUhS\nwawEJKlgVgKSVDArAUkqmJWAJBVsbMxKQJKKNT5uJSBJxbISkKSCeWNYkgrmjWFJKpiVgCQVzEpA\nkgo2MgI9Pa31YQhI0gI1Ogrd3a31YQhI0gJlJSBJBbMSkKSCWQlIUsGsBCSpYKOjVgKSVKyRESsB\nSSqWlYAkFcxKQJIKZiUgSQWzEpCkglkJSFLBrAQkqWBWApJUMCsBSSqYlYAkFcxKQJIKZiUgSQWz\nEpCkgs2bEIiIayNie0Q8ExG/PsX2D0fE3ojY2pi+1I7jSlLJDh2Cs85qrY9Kq4OIiA7ga8BHgFeA\nxyPi/szcPqnp9zPzhlaPJ0mqO3QIFi9urY92VAJXAc9m5kuZOQbcC9w4Rbtow7EkSUC1CuPj8+Ny\n0Frg5ablHY11k10TEU9GxLcj4j1tOK4kFevwpaBo8dfrli8HzdCPgPWZeTAirgPuAzbN0rEl6Yxz\n8GDrl4KgPSEwCKxvWl7XWHdEZg41zT8YEb8bESszc/dUHW7evPnIfH9/P/39/W0YpiSdGQYGBrjv\nvgFGR6Hp7fKURGa21kFEJ/D/qN8YfhX4IfDpzHy6qc3qzNzZmL8K+OPM3DhNf9nqmCTpTLd9O3zs\nY/XXiCAzT+nCUMuVQGZWI+IW4C+p32O4OzOfjoib65tzC/CJiPhVYAw4BPxSq8eVpJK148kgaNM9\ngcz8X8BFk9b956b5rwNfb8exJEntuyfgJ4YlaQFqxwfFwBCQpAWpXZeDDAFJWoAMAUkqmPcEJKlg\n3hOQpIIdPGgISFKx9u+HZcta78cQkKQFaN8+Q0CSimUlIEkFMwQkqWD798Py5a33YwhI0gJkJSBJ\nBfPGsCQVzEpAkgpmCEhSwQwBSSrUoUOQ6RfISVKR3nwTVq2COKX/VXgiQ0CSFpjDIdAOhoAkLTCG\ngCQV7M03YeXK9vRlCEjSArN7t5WAJBXLy0GSVLDXXoPVq9vTlyEgSQvM4CCsXduevgwBSVpgBgdh\nzZr29GUISNIC085KIDKzPT21SUTkfBuTJM0XtRosWgRDQ9DdXV8XEWTmKX1+uNLOwUmSTq8dO+Bt\nb6sHwP6R/fzg5R+01J8hIEkLxOC+Qf7LDx6h8/q/4cot/4ftb2znijVXtNSnl4MkaR4aGR/hidee\n4JGXH+GRHfVpeHyY88avYdEb1/CVWz/ElWuupKfS09LlIENAkuZYLWs8t/s5tr66lR8O/pBHdjzC\ntp3buGjVRVyz7hquXnc115x/DReuuJBbbw02bIBf+7Wj+3tPQJIWiGqtyvY3trP11a1sfXUrP3r1\nRzz52pOsOmsVV5x3BVeuuZLbP3I7V665kiXdS47Z//HH4eMfb994rAQk6TTZc2gPT+166sj0xGtP\nsG3nNtYsW8Pl513O5eddzhXnXcH7znsfKxef+BvhRkdhxYr6J4ab/1cxKwFJmkMHxw7yk9d/MuEN\n/6ldT7Fv
ZB8X913MJedcwsV9F/PJiz/JZedeRu+i3lM6zmOPwaZN7flvJQ8zBCRpBqq1Ki+99RLP\nvPnMMdOuA7u46G0XcUnfJVxyziV84aovcEnfJazvXU+047//arj/frjhhrZ1B7TpclBEXAvcQf0T\nyHdn5m9P0earwHXAAeBzmfnkNH15OUjSnDg4dpCX9r7Ei3tf5IW9L/DCnhd4dvezPPPmMzy/53lW\nL13NRasuYtOqTROm9b3rqXSc3t+ph4dh40Z46CF497snbpvTy0ER0QF8DfgI8ArweETcn5nbm9pc\nB1yYme+MiPcDdwFXt3psSZqpzGTv8F4G9w+yY98OXtz74oTphb0v8NbwW6zvXc8FKy5gY+9GNp69\nkc9e+lk2rdrEO1a+g8Vdbfif3U/RHXfAVVcdGwCtakd0XQU8m5kvAUTEvcCNwPamNjcC9wBk5mMR\n0RsRqzNzZxuOP+syk0PjhzgweoDh8eGTmsZqY1RrVcZr44zXxqlm0/w065OJlVEwMfCby83J2yod\nFSodFTqj88j8kXUdU6yb1K6rs4vuzm56OnvoqfTQ09lTX27MH29dZ0fn6ftLkBoyk/2j+9l1YBev\nDb3G4L5BXtn/CoP7Bxnc35hvrOvq7GLNsjWsXbaWC86+gI1nb+Sjmz5af9M/eyPnLj2XjphfX6mW\nCd/4Btx5Jzz6aPv7b0cIrAVeblreQT0YjtdmsLFuyhC4996ZHfhkrxplQjXHOVjby1B1Nwdqe+qv\n1T0cqNVfD9X2MZxDHKruZzj3M1wbYri2n+Hafg7V9jOSQwzXhuiMLnpiCd0di+mKRdNO3U3zleih\nM7rooEJnVOhgMR1RoZMKHXTW56MxT4XFjfmJ3/M38Q/dHBBJTticJLWsUmOcsRynRn2+muNHX/Pw\nuuEJ22pZpco41RxlPEcZZ4Sx2gjjjNSXc4SxPLw89ToIuqKHCj1Uoj51HX5lUWO56ZVJy4129X3q\nr4fXVzjafqo+mvep0APMvFI+lZ8r+25P35nJKAc4lHs5mHs5lHs5UHuDoXydodrrTa+7Jqyr0M3S\njnNYFufS27GW3lhTf+24jEtiLR+ItfSetYaeWApjwO7GBLzemH44i3/OmRgaguefh+99D/r64Lvf\nhQ0bTq6PmZiXN4a//OXNR+b7+vrp6+uftm2tY4Thzl2MVHYy3LmT4cpOhjt3NV53MlLZxWjnbkY7\n9jDauZvxjgN01Xrprq6kp7qSrtoKeqor6a6uoLu2gkrtHLpqb6dSW8qS2jLOri2jkkvpqi2jUltG\nV20pldpSOug6Ziwnc/9nurbVxnQ6+u5oTJP/0tvR92RJklSpdYxQixGqMUKVpvkYoRrDjeVhqjFC\nrfE6GsMcOrJ+mCpvUY2dE9o2vx5Zz3TbR+nIbjpzEZ3Zc/SVScu5iA56qDS3o2fSfseuO7LP5LZT\nLAdxUuf7dP39tKvvJKkxylgcYCwOMN70enh+LIYmbBuLIUbjLUZi75FptOPofCc99OTZ9NTOrr/m\nSs6q9bEoz2FxbR0rau9jLX2cVTuHxXkOi2vnUOH4l2kOBhw8DefkZNufTNvFi+FDH6p/KOy97524\n78DAAAMDAzPv7HhjavUmbERcDWzOzGsby78BZPPN4Yi4C3goM7/VWN4OfHiqy0GHbwzXssbrB15n\nx74d7Ni3g5f3vXxkfse+Hbw69Co7h3ZycOwgfUv6WL10NauXrK7PL1nN6qX1+b4lfaxavIqVi1ey\nYvEKlvcsn3flnk6vWtYYrY4yMj7C8PgwI9XG66TlqdadcHmm7cZHGKmOHLm0tqiyiJ5K43WK5e7O\nbjo7OgmCjug4ZoporGeKddFBENSydmSqZnXC8uTp8PZqrcpodbR+vqojR87b8ZYrHRWWdC1hSfcS\nlnYvPTI/4XXS9rMXnT3l1Luol+7O7rn+kVlw5vpzAo8D74iIDcCrwKeAT0
9q8wDweeBbjdDYe7z7\nARfceQGv7H+F3p5e1i1fx7rl6zh/+fmsW76Oiy+8mLXL17Jm2Rr6lvSxYtGKtj6CpTNPR3SwqLKI\nRZVF9HJqz2e3Q2YyWh2dMiymCozDvwzVL+sd+8Z9ePuEdU1tO6NzQnB0dnROGShHtjfaH76/cziw\njrfc3dntL1ULXDsfEb2To4+I3h4RN1OvCLY02nwNuJb6I6I3ZebWafrKn+7+KWuWrWFRZVHLY5Ok\nM51fICdJBWslBKzjJKlghoAkFcwQkKSCGQKSVDBDQJIKZghIUsEMAUkqmCEgSQUzBCSpYIaAJBXM\nEJCkghkCklQwQ0CSCmYISFLBDAFJKpghIEkFMwQkqWCGgCQVzBCQpIIZApJUMENAkgpmCEhSwQwB\nSSqYISBJBTMEJKlghoAkFcwQkKSCGQKSVDBDQJIKZghIUsEMAUkqmCEgSQUzBCSpYIaAJBXMEJCk\nglVa2TkiVgDfAjYALwKfzMy3pmj3IvAWUAPGMvOqVo4rSWqPViuB3wD+d2ZeBPw18JvTtKsB/Zn5\nPgNg5gYGBuZ6CPOC5+Eoz8VRnov2aDUEbgT+qDH/R8DHpmkXbThWcfwhr/M8HOW5OMpz0R6tvjH3\nZeZOgMx8Deibpl0CfxURj0fEv2jxmJKkNjnhPYGI+CtgdfMq6m/qX5qieU7TzQcy89WIOId6GDyd\nmQ+f9GglSW0VmdO9b89g54inqV/r3xkR5wIPZea7T7DPbcD+zPxP02w/9QFJUqEyM05lv5aeDgIe\nAD4H/DbwK8D9kxtExFlAR2YORcQS4B8C/366Dk/1DyJJOnmtVgIrgT8Gzgdeov6I6N6IOA/4Rmb+\n44i4APgz6peKKsB/y8zbWx+6JKlVLYWAJGlhm5PHNiPi2ojYHhHPRMSvT9PmqxHxbEQ8GRGXzfYY\nZ8uJzkVEfCYi/rYxPRwR752Lcc6GmfxcNNr9XESMRcTHZ3N8s2mG/0b6I+KJiHgqIh6a7THOlhn8\nG1keEQ803it+HBGfm4NhzoqIuDsidkbEtuO0Obn3zsyc1Yl68DxH/VPGXcCTwLsmtbkO+HZj/v3A\no7M9znl0Lq4Gehvz15Z8LprafRf4n8DH53rcc/hz0Qv8HbC2sfy2uR73HJ6L3wR+6/B5AN4EKnM9\n9tN0Pj4IXAZsm2b7Sb93zkUlcBXwbGa+lJljwL3UP3TW7EbgHoDMfAzojYjVnHlOeC4y89E8+lUc\njwJrZ3mMs2UmPxcAXwD+BNg1m4ObZTM5F58B/jQzBwEy841ZHuNsmcm5SGBZY34Z8GZmjs/iGGdN\n1h+t33OcJif93jkXIbAWeLlpeQfHvrFNbjM4RZszwUzORbN/Djx4Wkc0d054LiJiDfCxzPw96p9X\nOVPN5OdiE7AyIh5qfAjzs7M2utk1k3PxNeA9EfEK8LfArbM0tvnopN87W31EVLMkIn4euIl6OViq\nO4Dma8JnchCcSAW4HPgFYAnwSEQ8kpnPze2w5sQvAk9k5i9ExIXUP5B6aWYOzfXAFoK5CIFBYH3T\n8rrGusltzj9BmzPBTM4FEXEpsAW4NjOPVwouZDM5F1cC90ZEUL/2e11EjGXmA7M0xtkyk3OxA3gj\nM4eB4Yj4PvD3qF8/P5PM5FzcBPwWQGb+NCJeAN4F/N9ZGeH8ctLvnXNxOehx4B0RsSEiuoFPUf/Q\nWbMHgF8GiIirgb3Z+I6iM8wJz0VErAf+FPhsZv50DsY4W054LjLz7Y3pAur3Bf7lGRgAMLN/I/cD\nH4yIzsYHMt8PPD3L45wNMzkXLwH/AKBx/XsT8PysjnJ2BdNXwSf93jnrlUBmViPiFuAvqYfQ3Zn5\ndETcXN+cWzLzOxFxfUQ8BxygnvRnnJmcC+DfASuB3238BnxG/n8MMzwXE3aZ9UHOkhn+G9keEX8B\nbAOqwJbM/MkcDvu0mOHPxZeBP2x6bP
LfZObuORryaRUR3wT6gVUR8TPgNqCbFt47/bCYJBXM7/iX\npIIZApJUMENAkgpmCEhSwQwBSSqYISBJBTMEJKlghoAkFez/A473i40bdjO3AAAAAElFTkSuQmCC\n",
"text/plain": [
"<matplotlib.figure.Figure at 0x7f6e01cce710>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"plt.plot(LR_vals_x, LR_vals_y_norm)\n",
"plt.plot(LR_vals_x, LR_vals_y_norm_prime)\n",
"#plt.plot(LR_vals_x, exponential_model_norm)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Plot the function and plot some residuals diagnostics stuff!"
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"#count, bins, ignored = plt.hist(ss_rdd.map(lambda p: p[1]).collect(), 100, normed=True)\n",
"#fig = plt.figure()\n",
"#ax1 = fig.add_subplot(211)\n",
"#ax1.plot(bins, 1/(sigma * np.sqrt(2 * np.pi)) * np.exp( - (bins - mu)**2 / (2 * sigma**2) ), linewidth=2, color='r')\n",
"# content of the subplot here...\n",
"# SS and DS histograms wiht fit overlay somehow add-ons\n",
"#count2, bins2, ignored2 = plt.hist(ds_rdd.map(lambda p: p[1]).collect(), 100, normed=True)\n",
"#ax2 = fig.add_subplot(212)\n",
"#ax2.plot(bins2, 1/(sigma * np.sqrt(2 * np.pi)) * np.exp( - (bins2 - mu)**2 / (2 * sigma**2) ), linewidth=2, color='r')\n",
"# SS and DS histograms wiht fit overlay somehow categorical\n",
"#plt.tight_layout()\n",
"#fig = plt.gcf()\n",
"#plotly_fig = tls.mpl_to_plotly( fig )\n",
"#plotly_fig['layout']['title'] = 'Simple Subplot Example Title'\n",
"#plotly_fig['layout']['margin'].update({'t':40})\n",
"#fig.show()"
]
},
{
"cell_type": "code",
"execution_count": 30,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"def compute_confusion_matrix(p): return False\n",
"\n",
"def computer_cllr(p):\n",
" return False"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Compute same model using KDE!"
]
},
{
"cell_type": "code",
"execution_count": 142,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"10000\n",
"1239\n"
]
}
],
"source": [
"from pyspark.mllib.stat import KernelDensity\n",
"\n",
"# Construct the density estimator with the sample data and a standard deviation for the Gaussian\n",
"# kernels\n",
"\n",
"kd_ss = KernelDensity()\n",
"kd_ss.setSample(ss_rdd)#.map(lambda p: p[1]))\n",
"kd_ss.setBandwidth(0.1)\n",
"\n",
"kd_ds = KernelDensity()\n",
"kd_ds.setSample(ds_rdd)#.map(lambda p: p[1]))\n",
"kd_ds.setBandwidth(0.1)\n",
"\n",
"LR_vals_y_kde = kd_ss.estimate(LR_vals_x)/kd_ds.estimate(LR_vals_x) # LR right here!\n",
"\n",
"#poly_model_kde = numpy.polyfit(LR_vals_x, LR_vals_y_kde, 3)# fit the 3rd order polynomial, make sure this is ok by exmaining residuals later on!\n",
"### dont forget residul\n",
"#LR_vals_y_kde_prime = numpy.polyval(poly_model_kde, LR_vals_x)# sanity check!\n",
"print len(LR_vals_y_kde)\n",
"print ss_rdd.count()"
]
},
{
"cell_type": "code",
"execution_count": 143,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"[<matplotlib.lines.Line2D at 0x7fd32782f210>]"
]
},
"execution_count": 143,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAW8AAAEGCAYAAACqxxSGAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAE4xJREFUeJzt3XuwXWV5x/HvE+6JJICE+01aI4NTROQutpuLA1KKrdKp\nUm8MZZyhXKbVVtqp5ThtRzvTehumIJTSOgOlSmpLVRRUtg5gEOQSkGAECQkkQIQQEpKDITz9Y2/C\n4cDJXufstdbOOvv7mdmzL+fd7/vOmnN+efOud603MhNJUrPMGHQHJEmTZ3hLUgMZ3pLUQIa3JDWQ\n4S1JDWR4S1IDlRreEXFlRDwZEQsLlH1XRPw0IjZExPvG/eyjEbE4In4eER8ps4+SNB2UPfK+Cji5\nYNlHgY8CV4/9MCJ2Bv4WOAI4Crg4IuaU2UlJarpSwzszbwFWjf0sIg6MiBsi4o6I+GFEzOuWXZqZ\n9wPjrxI6GbgxM1dn5rPAjcApZfZTkppu6xrauBz4eGY+HBFHApcCJ26m/N7AsjHvH+9+JknqqjS8\nI2IWcCzw9YiI7sfbVNmmJA2DqkfeM4BVmXnYJL7zONAa834f4OYyOyVJTVdozjsiLoyI+7qPC3oV\n7z7IzDXAIxFxxpi6DpngOy/7LvDuiJjTPXn57u5nkqSunuEdEW8FzgYOBw4FTouIAycoew1wGzAv\nIpZGxFnAHwNnR8Q9EXE/cHq37OERsQw4A7gsIu4DyMxVwN8BdwK3A5/pnriUJHVFr1vCdkfNJ2fm\nOd33fwOMZuY/1dA/SdLrKDJtcj/wrojYOSJmAqcC+1bbLUnS5vQ8YZmZD0bEPwI3AWuBu4GNVXdM\nkjSxntMmr/lCxD8AyzLzsnGfuyWPJE1SZkbvUq9VdLXJ3O7zfsAfANdM0AkfmVx88cUD78OW8PA4\neCw8Fpt/9KPoOu/5EbELsAE4NzOf66tVSVJfCoV3Zv521R2RpGGyaFF/3/d+3hVotVqD7sIWwePw\nCo/FKzwWHaOj/X1/0icsJ6woIsuqS5KmuwUL4JhjgqzyhKUkqVwvvNDf9w1vSRoAw1uSGsjwlqQG\nMrwlqYEMb0lqIMNbkhrI8JakBjK8JamBDG9JaiDDW5IayPCWpAYyvCWpgQxvSWqgWsI7Iv4sIu6P\niIURcXVEbNtfs5I03CoP74jYCzgfOCwzD6Gz+84H+mtWkoZbv+FddA/LrYBZEfESMBNY3l+zkjTc\nKh95Z+Zy4J+BpcDjwLOZ+b3+mpWk4Vb5yDsidgLeC+wPrAaui4gzM/Oa8WVHRkY2vW61Wu5VJ0lj\ntNtt2u020P8GxD33sIyIM4CTM/Oc7vsPA0dl5nnjyrmHpSQVdNxxcOut1e5huRQ4OiK2j4gATgT6\n/DdDkoZbHXPePwGuA+4G7gUCuLy/ZiVpuPUb3j2nTQpX5LSJJBX2lrfA4sXVTptIkkq2fn1/3ze8\nJWkARkf7+77hLUkDYHhLUgMZ3pLUMBs3wosv9leH4S1JNRsdhe23768Ow1uSamZ4S1IDjY7CDjv0\nV4fhLUk1c+QtSQ1keEtSAxnektRA69cb3pLUOI68JamBXG0iSQ3kyFuSGqiW8I6IeRFxd0Tc1X1e\nHREX9NesJA2vMsK75+7xmbkYeDtARMwAHgO+0V+zkjS8BrHa5CTg4cxc1l+zkjS8BjHn/UfAf/bX\npCQNtzJWm/ScNnlZRGwDnA5cNFGZkZGRTa9brRatVquPrknS9NJut2m323z/+7DNNv3VVXj3+Ig4\nHTg3M0+Z4OfuHi9JBXziE7DXXvDJT9aze/wHccpEkvpW2wnLiJhJ52Tlf/fXnCSplqWCAJm5Dpjb\nX1OSJPAKS0lqJO9tIkkN5MhbkhrI8JakBnIzBklqIEfektRAhrckNZCrTSSpgdavh+22668Ow1uS\narZ+Pcya1V8dhrck1WzdOpg5s786DG9JqtGG
DZ3nfm8Ja3hLUo3KGHWD4S1JtVq3rv+VJmB4S1Kt\nHHlLUgMZ3pLUQOvX1xjeETEnIr4eEYsi4mcRcVT/TUvS8Clr5F109/gvAd/OzD+MiK2BEpqWpOFT\nW3hHxGzgXZn5MYDMfBF4rv+mJWn41Dnn/SbgVxFxVUTcFRGXR0QJC10kafjUOW2yNXAY8KeZeWdE\nfBG4CLh4fMGRkZFNr1utFq1Wq/8eStI00W63+drX2jzxBIyJyymJzNx8gYjdgR9n5oHd98cBn8rM\n3xtXLnvVJUnD7vOfh8ce6zxHBJkZU6mn57RJZj4JLIuIed2PTgQemEpjkjTs6l5tcgFwdURsA/wS\nOKv/piVp+JS1zrtQeGfmvcAR/TcnScNt3TrYddf+6/EKS0mqkZfHS1IDGd6S1ECGtyQ1kOEtSQ1k\neEtSAxnektRAtd7PW5JUDkfektRAhrckNZC7x0tSw2Q68pakxhkdhW22ga2L3hJwMwxvSarJ2rXw\nhjeUU5fhLUk1MbwlqYGef7688C408xIRS4DVwEvAhsw8spzmJWl4lDnyLjpt/hLQysxV5TQrScNn\nENMmMYmykqTXsXYtzJpVTl1FAzmBmyLijog4p5ymJWm4DGLa5J2ZuSIi5tIJ8UWZeUs5XZCk4VB7\neGfmiu7zyoj4BnAk8JrwHhkZ2fS61WrRarVK6aQkTQf33ttm4cI2Y6JyyiIzN18gYiYwIzPXRsQs\n4EbgM5l547hy2asuSRpmL4f2y88RQWbGVOoqMvLeHfhGRGS3/NXjg1uS1NvatbDHHuXU1TO8M/MR\n4NBympOk4eUVlpLUQIa3JDWQ4S1JDWR4S1IDGd6S1ECGtyQ1kOEtSQ1keEtSAw3iroKSpD5s2NB5\nbL99OfUZ3pJUg5enTGJKdzJ5LcNbkmrw3HMwZ0559RneklSD1asNb0lqnOeeg9mzy6vP8JakGjjy\nlqQGcuQtSQ20evWAwjsiZkTEXRFxfXnNS9JwGORqkwuBB8prWpKGx0CmTSJiH+BU4F/La1qShseg\nTlh+AfgLwO3hJWkKah95R8TvAk9m5j1AdB+SpEko+4Rlz93jgXcCp0fEqcAOwI4R8dXM/Mj4giMj\nI5tet1otWq1WSd2UpGZ77jl45JE2IyPtUuqLzOIzIRHxO8AnMvP01/lZTqYuSRomhx8Ol14KRxzx\nymcRQWZOaTbDdd6SVIOyT1gWmTbZJDN/CPywvOYlaTh4haUkNdDArrCUJE3NCy/Axo2www7l1Wl4\nS1LF1qzpjLrL2kUHDG9Jqtyzz5Z7shIMb0mq3DPPwC67lFun4S1JFVu1Cnbeudw6DW9Jqpgjb0lq\nIEfektRAq1Y58pakxnnmGUfektQ4jrwlqYEceUtSAznylqQGcuQtSQ3kyFuSGqiKkXfPzRgiYjvg\nR8C23fLXZeZnyu2GJE1Po6Pw4oswa1a59fYM78x8ISKOz8x1EbEVcGtE3JCZPym3K5I0/bx8dWWZ\nt4OFgtMmmbmu+3I7OoHvTsOSVEAV891QMLwjYkZE3A08AdyUmXeU3xVJmn6efrr8+W4ouAFxZr4E\nvD0iZgP/ExEHZ+YD48uNjIxset1qtWi1WiV1U5KaaeVKmDu387rdbtNut0upNzInNwMSEZ8Gns/M\nz4/7PCdblyRNd1/5Ctx5J1xxxWt/FhFk5pRmw3tOm0TErhExp/t6B+DdwINTaUyShs3YkXeZikyb\n7An8R0TMoBP2/5WZ3y6/K5I0/axcCQccUH69RZYK3gccVn7TkjT9PfUUHHFE+fV6haUkVWjlStht\nt/LrNbwlqUJVzXkb3pJUoarCe9JLBSesyKWCkvQqmbDddrBmTed5vEqXCkqSpmb1athhh9cP7n4Z\n3pJUkaeeqmbKBAxvSapMVfPdYHhLUmUMb0lqoBUrYM89q6nb8JakiixfDnvtVU3dhrckVWTFCsNb\nkhrHkbck
NdDy5c55S1LjVDny9vJ4SarAr38Ns2bB6ChstdXrl/HyeEnawjzxBOy++8TB3a8i26Dt\nExE/iIifRcR9EXFBNV2RpOmjyjXeUGwbtBeBP8/MeyLiDcBPI+LGzHQfS0maQJXz3VBg5J2ZT2Tm\nPd3Xa4FFwN7VdUmSmu/xxwcc3mNFxAHAocDtVXRGkqaLpUthv/2qq7/ItAkA3SmT64ALuyPw1xgZ\nGdn0utVq0Wq1+uyeJDXTo4/CoYe++rN2u0273S6l/kJLBSNia+CbwA2Z+aUJyrhUUJK6jjoKvvAF\nOPbYicvUsVTw34AHJgpuSdKrPfoo7L9/dfX3HHlHxDuBHwH3Adl9/HVmfmdcOUfekkTnwpyddoJ1\n62DGZobI/Yy8e855Z+atQEXLzCVp+lm6FPbZZ/PB3S+vsJSkki1ZAgccUG0bhrcklazq+W4wvCWp\ndEuWGN6S1DgPPQRvfnO1bRjeklSyxYth3rxq2/B+3pJUopdegh137NxVcPbszZf1ft6StIVYvrwT\n2r2Cu1+GtySVqI4pEzC8JalUhrckNZDhLUkN9MADcNBB1bdjeEtSiRYuhLe9rfp2DG9JKsnKlZ07\nCe67b/VtGd6SVJKFC+GQQyCmtHJ7cgxvSSpJXVMmYHhLUmleHnnXoWd4R8SVEfFkRCyso0OS1FR3\n3fXaTYerUmQbtOOAtcBXM3PCf1O8t4mkYbZmDeyxB6xaBdtuW+w7ld7bJDNvAVZNpXJJGhZ33tkZ\ndRcN7n455y1JJViwAI4+ur72em5APBkjIyObXrdaLVqtVpnVS9IWa8EC+NCHNl+m3W7TbrdLaa/Q\n/bwjYn/g/5zzlqTXeukl2G03uPde2Hvv4t+r437e0X1Iksa55x6YO3dywd2vIksFrwFuA+ZFxNKI\nOKv6bklSc3zve3DSSfW22XPOOzPPrKMjktRUN90E551Xb5vuYSlJfVi7FvbaC5YtgzlzJvdd97CU\npAG54QY45pjJB3e/DG9J6sP8+fD+99ffrtMmkjRF69fDnnvCz38Ou+8++e87bSJJAzB/fueqyqkE\nd78Mb0maoiuugHPOGUzbTptI0hQsWgTHHw9Ll079ZlROm0hSzT73uc7a7rruIjheqTemkqRh8Mgj\n8K1vwUMPDa4PjrwlaZIuuqgz6t5pp8H1wZG3JE3CD34At98OV1012H448pakglatgrPPhksugZkz\nB9sXV5tIUgEbN8IZZ8C++8KXv1xOnf2sNnHaRJJ6yITzz4dnn4Vrrx10bzoMb0najF//unMhzoMP\ndm79ut12g+5RR6E574g4JSIejIjFEfGpqjslSVuC+++HY4+FNWs6Jypnzx50j15RZCedGcAlwMnA\nW4EPRsRBVXesycraYLTpPA6v8Fi8ognHYskS+PjHodXqPM+fD7NmDbpXr1Zk5H0k8IvMfDQzNwDX\nAu+ttlvN1oRfzjp4HF7hsXjFlngsMmHxYrjsMjjhBHjHO+CNb+zcLfCccyC2wB18i8x57w0sG/P+\nMTqBLkmNsGEDPP88rFwJTz3VeaxY0QnnRYs60yMzZnTuVXLuuXDaabD99oPu9eaVesLytNN6lym6\nmrDJ5R5+GG67rf52t7RyS5bAzTfX3+6WWG7ZMvjOd+pvd0sst2IFXH99de2+8ELnPttjH5mdaY+5\nczu3b91tt87zvHlw8slw8MGw//5b5gh7Ij3XeUfE0cBIZp7SfX8RkJn5j+PKuchbkiZpquu8i4T3\nVsDPgROBFcBPgA9m5qKpNChJ6l/PaZPM3BgR5wE30jnBeaXBLUmDVdrl8ZKk+kzqxlRFLtaJiC9H\nxC8i4p6IOLScbm55eh2LiDgzIu7tPm6JiN8aRD/rUPQirog4IiI2RMT76uxfnQr+jbQi4u6IuD8i\nJjil23wF/kZmR8T13ay4LyI+NoBu1iIiroyIJyNi4WbKTC47M7PQg07QPw
TsD2wD3AMcNK7Me4Bv\ndV8fBSwoWn+THgWPxdHAnO7rU4b5WIwp933gm8D7Bt3vAf5ezAF+Buzdfb/roPs9wGPxV8BnXz4O\nwNPA1oPue0XH4zjgUGDhBD+fdHZOZuRd5GKd9wJfBcjM24E5ETGAfZUr1/NYZOaCzFzdfbuAznr5\n6ajoRVznA9cBT9XZuZoVORZnAvMz83GAzPxVzX2sS5FjkcCO3dc7Ak9n5os19rE2mXkLsGozRSad\nnZMJ79e7WGd8II0v8/jrlJkOihyLsf4EuKHSHg1Oz2MREXsBv5+ZlwINWkk7aUV+L+YBu0TEzRFx\nR0R8uLbe1avIsbgEODgilgP3AhfW1Lct0aSz07sKViwijgfOovPfpmH1RWDsnOd0DvBetgYOA04A\nZgE/jogfZ+YAd0McmJOBuzPzhIj4DeCmiDgkM9cOumNNMJnwfhzYb8z7fbqfjS+zb48y00GRY0FE\nHAJcDpySmZv7L1OTFTkWhwPXRkTQmdt8T0RsyMzra+pjXYoci8eAX2XmKDAaET8C3kZnfng6KXIs\nzgI+C5CZD0fEI8BBwJ219HDLMunsnMy0yR3Ab0bE/hGxLfABYPwf3/XAR2DTlZnPZuaTk2ijKXoe\ni4jYD5gPfDgzHx5AH+vS81hk5oHdx5vozHufOw2DG4r9jfwvcFxEbBURM+mcnJqO100UORaPAicB\ndOd35wG/rLWX9Qom/l/npLOz8Mg7J7hYJyI+3vlxXp6Z346IUyPiIeB5Ov+yTjtFjgXwaWAX4F+6\nI84NmTntbuhV8Fi86iu1d7ImBf9GHoyI7wILgY3A5Zn5wAC7XYmCvxd/D/z7mOVzf5mZzwyoy5WK\niGuAFvDGiFgKXAxsSx/Z6UU6ktRA7h4vSQ1keEtSAxnektRAhrckNZDhLUkNZHhLUgMZ3pLUQIa3\nJDXQ/wMBf2B68uL6egAAAABJRU5ErkJggg==\n",
"text/plain": [
"<matplotlib.figure.Figure at 0x7fd3279997d0>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"plt.plot(LR_vals_x, LR_vals_y_kde)\n",
"#plt.plot(LR_vals_x, LR_vals_y_kde_prime)\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### define addon_donners subset\n",
"\n",
"### define the mapping to get a ranked list of ranked paired users\n",
"\n",
"### weight the addons by their donner's LR value\n",
"\n",
"### aggregate (sum) and deduplicate to addon-LRagg RDD then sort\n",
"\n",
"### recomend top n addons"
]
},
{
"cell_type": "code",
"execution_count": 144,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"NEW_USER_ID = \"ff1d3378-366d-475d-84e9-9f282058912e\""
]
},
{
"cell_type": "code",
"execution_count": 145,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"def calcKDE_LR(ss_model, ds_model, score):\n",
" return (ss_model.estimate(score)/ds_model.estimate(score))\n",
" # this can be faster if we pre-fir the KDE stuff with a polynomial or exponential model that is parametric and callable\n",
"\n",
"def generate_one_versus_rest_comparison(rdd_donnors, new_user):\n",
" newUSRnew_user = new_user.collect()\n",
" comp_rdd = rdd_donnors.map(lambda p: ( (p[0],p[1]),newUSRnew_user[0],newUSRnew_user[1]) ) \n",
" return comp_rdd"
]
},
{
"cell_type": "code",
"execution_count": 146,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"arbitrary_new_user_addons = subset_addon_rdd.filter(lambda p: p['client_id'] == NEW_USER_ID)\n",
"arbitrary_new_user_cat = subset_cat_rdd.filter(lambda p: isinSetCat(p[0])).map(lambda p: tokenize_catVar(p))\n",
"compRDD_newUser = generate_one_versus_rest_comparison(new_subset_cat_tokenized_rdd, arbitrary_new_user_cat)"
]
},
{
"cell_type": "code",
"execution_count": 147,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"#newSIMS = \n",
"new_sims = compRDD_newUser.map(lambda p: computeCosSim(p)).cache()"
]
},
{
"cell_type": "code",
"execution_count": 148,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"[<matplotlib.lines.Line2D at 0x7fd319760290>]"
]
},
"execution_count": 148,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAXoAAAEGCAYAAABrQF4qAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAEdtJREFUeJzt3X2MZXV9x/H3Z1lBF3kyKlZRkBo0aiisCka03ooIasW2\n0fgUQTCmqVqMbSzYJ8Y0tdrUqkmrhhSptaBWFEUjFYy9rc9Pu1ueHxTqIsqqEaHgE4Vv/7iHdWZk\nuXeYe+7M/Ob9SiZz7plzz/l+9+5+7tnf+d05qSokSe3asNIFSJL6ZdBLUuMMeklqnEEvSY0z6CWp\ncQa9JDVuqkGf5MwkO5JcPMG2T0vyjSS3J/m9RT87McnVSa5KcsI0a5Sk9WbaZ/RnAcdOuO23gROB\ns+evTLIf8JfAk4AjgdOT7DPNIiVpPZlq0FfV54Gb5q9LcnCSC5J8Lcl/Jjmk23Z7VV0KLP7E1rHA\nhVV1c1X9GLgQOG6adUrSerJxBsc4A/j9qvpWkiOAdwNH38P2DwOun/f4hm6dJOle6DXok+wJPAX4\ncJJ0q+/T5zElSQv1fUa/AbipqjYv4Tk3AIN5jw8A/mOaRUnSejLRGH2S1yW5pPs6Zdzm3RdV9b/A\ndUleMG9fh+7iOXf5NHBMkn26C7PHdOskSffC2KBP8jjglcATgcOA305y8C62PQf4InBIku1JTgJe\nBrwyybYklwLHd9s+Mcn1wAuA9yS5BKCqbgL+Cvg68BXgTd1FWUnSvZBxv6a4Oxs/tqpe1T3+c+Bn\nVfV3M6hPkrRMkwzdXAo8Lcl+STYBzwEe3m9ZkqRpGXsxtqquTPJW4CLgVmArcEffhUmSpmPs0M2v\nPCH5a+D6qnrPovXeqkqSlqiqMn6r5Zl01s2Duu+PAH4XOOfutquqJr9OP/30Fa/B/uxvFv1985sr\n3896ev1mZdJ59B9J8gDgduDVVXVLjzVJWiGPehT87Gewxx4rXYmmaaKgr6rf7LsQSavDDE80NSP+\nPvoJDAaDlS6hV/a3ttmfxlnyxdhd7iipWY45SZq+BH76U7jvfVe6kvUhCbVaLsZKktYug16SGmfQ\nS1LjDHpJapxBL2kB51S0x6CXpMYZ9JLUOINekhpn0EtS4wx6SWqcQS9JjTPoJalxBr2kBZxH3x6D\nXpIaN+mtBF+f5NIkFyc5O8nufRcmSZqOsUGf5KHAHwKbq+pQRnelenHfhUmSpmPSe8buBuyZ5E5g\nE/Dd/kqSJE3T2DP6qvou8DZgO3AD8OOq+kzfhUmSpmPsGX2SfYHnAwcCNwPnJnlpVZ2zeNu5ubmd\ny4PBwHs9StI8w+GQ4XA48+OOvWdskhcAx1bVq7rHLweOrKrXLtrOe8ZKa1wCt90GmzatdCXrw2q6\nZ+x24MlJ7pskwNHAFf2WJWmleL7WnknG6L8KnAtsBf4bCHBGz3VJkqZk7NDNxDty6EZa8xK49VbY\nc8+VrmR9WE1DN5KkNcygl6TGGfSS1DiDXpIaZ9BLUuMMekkLOHmuPQa9JDXOoJekxhn0ktQ4g16S\nGmfQS1LjDHpJapxBL0mNM+glLeA8+vYY9JLUOINekho3NuiTHJJka5It3febk5wyi+IkScu3pDtM\nJdkAfIfRzcGvX/Qz7zAlrXEJ3HIL7LXXSleyPqzWO0w9E/jW4pCXJK1eSw36FwEf6KMQSVI/Nk66\nYZL7AMcDp+1qm7m5uZ3Lg8GAwWCwjNIkqS3D4ZDhcDjz4048Rp/keODVVXXcLn7uGL20xjlGP1ur\ncYz+JThsIzXP87X2TBT0STYxuhD70X7LkSRN20Rj9FX1E+BBPdciSeqBn4yVpMYZ9JLUOINekhpn\n0EtS4wx6SWqcQS9pAefRt8egl6TGGfSS1DiDXpIaZ9BLUuMMeklqnEEvSY0z6CWpcQa9pAWcR98e\ng16SGmfQS1LjJr3D1D5JPpzkiiSXJTmy78Ik
SdMx0R2mgHcCn6qqFybZCGzqsSZJ0hSlxlx5SbI3\nsLWqfn3MdjVuX5JWtwRuugn23XelK1kfklBV6fs4kwzdPBL4YZKzkmxJckaS+/VdmCRpOiYZutkI\nbAZeU1VfT/IO4DTg9MUbzs3N7VweDAYMBoPpVClJDRgOhwyHw5kfd5Khm/2BL1XVwd3jpwKnVtXz\nFm3n0I20xiXwox/BfvutdCXrw6oZuqmqHcD1SQ7pVh0NXN5rVZKkqZl01s0pwNlJ7gNcC5zUX0mS\npGkaO3Qz8Y4cupHWPIduZmvVDN1IktY2g16SGmfQS1LjDHpJapxBL2kB51S0x6CXpMYZ9JLUOINe\nkhpn0EtS4wx6SWqcQS9JjTPoJalxBr2kBZxH3x6DXpIaZ9BLUuMmuvFIkv8BbgbuBG6vqiP6LEqS\nND2T3mHqTmBQVTf1WYwkafomHbrJEraVJK0ik4Z3ARcl+VqSV/VZkCRpuiYdujmqqr6X5EGMAv+K\nqvp8n4VJkqZjoqCvqu9133+Q5DzgCOBXgn5ubm7n8mAwYDAYTKVISWrBcDhkOBzO/LipMZ+OSLIJ\n2FBVtybZE7gQeFNVXbhouxq3L0mrWwI/+AE88IErXcn6kISqSt/HmeSMfn/gvCTVbX/24pCXJK1e\nY4O+qq4DDptBLZKkHjhlUpIaZ9BLUuMMeklqnEEvSY0z6CWpcQa9pAX8OEx7DHpJapxBL0mNM+gl\nqXEGvSQ1zqCXpMYZ9JLUOINekhpn0EtawHn07THoJalxBr0kNW7ioE+yIcmWJOf3WZAkabqWckb/\nOuDyvgqRJPVjoqBPcgDwHOCf+i1HkjRtk57Rvx14A+D1eElaY8YGfZLnAjuqahuQ7kuStEZsnGCb\no4DjkzwHuB+wV5J/qaoTFm84Nze3c3kwGDAYDKZUpqRZcR59f4bDIcPhcObHTS3hVU3ydOCPq+r4\nu/lZLWVfklafBG68Efbff6UrWR+SUFW9j5I4j16SGrekM/p73JFn9NKa5xn9bHlGL0maCoNekhpn\n0EtS4wx6SWqcQS9pAedUtMegl6TGGfSS1DiDXpIaZ9BLUuMMeklqnEEvSY0z6CWpcQa9pAWcR98e\ng16SGmfQS1LjDHpJatzYe8Ym2QP4L2D3bvtzq+pNfRcmSZqOsUFfVT9P8ltV9ZMkuwFfSHJBVX11\nBvVJkpZpoqGbqvpJt7gHozcHr8tL0hoxUdAn2ZBkK3AjcFFVfa3fsiRJ0zJ26Aagqu4EDk+yN/Cx\nJI+tqssXbzc3N7dzeTAYMBgMplSmpFlxHn1/hsMhw+Fw5sdNLfFVTfIXwG1V9feL1tdS9yVpdUng\nhhvgoQ9d6UrWhyRUVfo+ztihmyQPTLJPt3w/4Bjgyr4LkyRNxyRDN78GvC/JBkZvDB+qqk/1W5Yk\naVommV55CbB5BrVIknrgJ2MlqXEGvSQ1zqCXpMYZ9JLUOINe0gJ+HKY9Br0kNc6gl6TGGfSS1DiD\nXpIaZ9BLUuMMeklqnEEvSY0z6CUt4Dz69hj0ktQ4g16SGmfQS1LjJrmV4AFJPpvksiSXJDllFoVJ\nkqZjklsJ/h/wR1W1Lcn9gW8kubCqvG+sJK0BY8/oq+rGqtrWLd8KXAE8rO/CJEnTsaQx+iQHAYcB\nX+mjGEnS9E0ydANAN2xzLvC67sz+V8zNze1cHgwGDAaDZZYnadacR9+f4XDIcDic+XFTE7yqSTYC\nnwQuqKp37mKbmmRfklavBLZvh4c/fKUrWR+SUFXp+ziTDt28F7h8VyEvSVq9JpleeRTwMuAZSbYm\n2ZLkuP5LkyRNw9gx+qr6ArDbDGqRJPXAT8ZKUuMMeklqnEEvSY0z6CUt4Czp9hj0ktQ4g16SGmfQ\nS1LjDHpJapxBL0mNM+glqXEGvSQ1zqCXtIDz6Ntj0EtS4wx6SWqcQS9JjTPoJalxk9xh6swkO5Jc\nPIuCJEnT
NckZ/VnAsX0XIknqx9igr6rPAzfNoBZJUg8co5e0gPPo2zP25uBLMTc3t3N5MBgwGAym\nuXtJWtOGwyHD4XDmx01N8Pad5EDgE1V16D1sU5PsS9LqlcB118FBB610JetDEqoqfR9n0qGbdF+S\npDVmkumV5wBfBA5Jsj3JSf2XJUmalrFj9FX10lkUIknqh7NuJKlxBr0kNc6gl7SAk+faY9BLUuMM\neklqnEEvSY0z6CWpcQa9JDXOoJekxhn0ktQ4g16SGmfQS1rAD0y1x6CXpMYZ9JLUOINekho3UdAn\nOS7JlUmuTnJq30VJkqZnkjtMbQD+ATgWeBzwkiSP6buw1WQlbuY7S/a3ttmfxpnkjP4I4Jqq+nZV\n3Q58EHh+XwXdeSds3drX3u+dpfxFu+MO2Latv1r6sJr/Id12G1xxxfL2sRr6u+UWuPrqpT3nG9+Y\nbLvV0F+fdtXfli2TzRC65BL4xS+mW9NybNs2yolZmiToHwZcP+/xd7p1vbjoIti8ua+99+8Tn4DD\nD1/pKtrx5jfDYx+70lUs3xveAI9+9NKe88QnwrXX9lNPC57wBJjkPe7QQ+Ed7+i9nIkdfji8732z\nPebYe8YuxfOet/x9XHfd9PY1LVddNfnZ1VVXjb6vpvrHWUp/s/bJT46+L+fPczX0d2/7ePaz4ZBD\n7nmbafd38smw117T299y3VN/J58Mj3/8+H2ceip87nPTrWs5XvMaOO+82R0vNeb/PkmeDMxV1XHd\n49OAqqq3LtrOj1lI0hJVVfo+xiRBvxtwFXA08D3gq8BLqmqZI6eSpFkYO3RTVXckeS1wIaMx/TMN\neUlaO8ae0UuS1rZlfzJ2rXyYKskBST6b5LIklyQ5pVu/X5ILk1yV5NNJ9pn3nDcmuSbJFUmeNW/9\n5iQXdz2/Y9763ZN8sHvOl5I8YrZdjj73kGRLkvO7x830l2SfJB/u6r0syZGt9Jfk9Uku7eo6u6tl\nTfeW5MwkO5JcPG/dTHpKcmK3/VVJTphRb3/b1b4tyUeS7L1qequqe/3F6I3im8CBwH2AbcBjlrPP\nvr6AhwCHdcv3Z3Td4THAW4E/6dafCrylW34ssJXR8NZBXZ93/Q/oK8CTuuVPAcd2y38AvKtbfhHw\nwRXo8/XAvwLnd4+b6Q/4Z+CkbnkjsE8L/QEPBa4Fdu8efwg4ca33BjwVOAy4eN663nsC9gO+1f39\n2Peu5Rn09kxgQ7f8FuBvVktvy232ycAF8x6fBpw6i38cU3ihPta9MFcC+3frHgJceXe9ABcAR3bb\nXD5v/YuBd3fL/w4c2S3vBvxgxj0dAFwEDPhl0DfRH7A38K27Wb/m+2MU9N/u/hFvBM5v5e8mo5PA\n+WHYZ0/fX7xN9/jdwIv67m3Rz34HeP9q6W25Qzcz/TDVtCQ5iNG78ZcZ/aXbAVBVNwIP7jZb3NsN\n3bqHMerzLvN73vmcqroD+HGSB/TSxN17O/AGYP6Fl1b6eyTwwyRndUNTZyTZRAP9VdV3gbcB27s6\nb66qz9BAb3fjwT32dHPX0672NUsnMzpD5x7qmVlv6+63Vya5P3Au8LqqupWFocjdPF7W4aa4r3s+\nUPJcYEdVbRtz3DXZH6Mz3c3AP1bVZuA2RmdKa/71S7Ivo18rciCjs/s9k7yMBnqbQHM9Jfkz4Paq\n+sA0d7ucJy836G8A5l/UOaBbtyol2cgo5N9fVR/vVu9Isn/384cA3+/W3wA8fN7T7+ptV+sXPCej\nzx/sXVU/6qGVu3MUcHySa4EPAM9I8n7gxkb6+w5wfVV9vXv8EUbB38Lr90zg2qr6UXf2dh7wFNro\nbbFZ9LRiuZTkFcBzgJfOW73ivS036L8GPCrJgUl2ZzR+dP4y99mn9zIaE3vnvHXnA6/olk8EPj5v\n/Yu7q9+PBB4FfLX77+bNSY5IEuCERc85sVt+IfDZ3jpZpKr+tKoeUVUHM3
odPltVLwc+QRv97QCu\nT3LXLwQ4GriMNl6/7cCTk9y3q+lo4HLa6C0sPBudRU+fBo7JaJbWfsAx3bppW9BbkuMYDZ0eX1U/\nn7fdyvc2hQsSxzGawXINcNq0L3hM8cLJUcAdjGYGbQW2dLU/APhM18OFwL7znvNGRlfIrwCeNW/9\nE4BLup7fOW/9HsC/deu/DBy0Qr0+nV9ejG2mP+A3GJ1cbAM+ymjmQRP9Aad3dV4MvI/RLLY13Rtw\nDvBd4OeM3sxOYnTBufeeGL2ZXANcDZwwo96uYXRRfUv39a7V0psfmJKkxq27i7GStN4Y9JLUOINe\nkhpn0EtS4wx6SWqcQS9JjTPoJalxBr0kNe7/AXK+G7HqBmQGAAAAAElFTkSuQmCC\n",
"text/plain": [
"<matplotlib.figure.Figure at 0x7fd328916050>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Collect (p[0][0], p[1]) for every record of new_sims whose p[0][0] is not\n",
"# NEW_USER_ID into a driver-side dict.\n",
"sim_dict_new_user = (new_sims\n",
"    .filter(lambda p: p[0][0] != NEW_USER_ID)\n",
"    .map(lambda p: (p[0][0], p[1]))\n",
"    .collectAsMap())\n",
"\n",
"# Materialise the values once so both estimators are evaluated on the same sequence.\n",
"new_user_sims = sim_dict_new_user.values()\n",
"\n",
"# LR right here: ratio of kd_ss.estimate to kd_ds.estimate at each value\n",
"# (NOTE(review): kd_ss / kd_ds are presumably kernel density estimators defined in an earlier cell - confirm).\n",
"LR_vals_new_user = kd_ss.estimate(new_user_sims)/kd_ds.estimate(new_user_sims)\n",
"\n",
"plt.plot(LR_vals_new_user)\n",
"plt.xlabel('position in sim_dict_new_user')\n",
"plt.ylabel('LR (kd_ss / kd_ds)')\n",
"plt.title('LR per entry of sim_dict_new_user');"
]
},
{
"cell_type": "code",
"execution_count": 149,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Pair each key of sim_dict_new_user with the entry of LR_vals_new_user at the same position.\n",
"# This relies on keys() and values() of an unmodified dict iterating in the same order.\n",
"LR_DICT = {key: lr for key, lr in zip(sim_dict_new_user.keys(), LR_vals_new_user)}"
]
},
{
"cell_type": "code",
"execution_count": 150,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"def assignLRval(p, Dictonary_of_lrs):\n",
"    \"\"\"Pair every element of p[1] with the log of the LR stored for p[0].\n",
"\n",
"    p: (key, items) pair; p[0] must be a key of Dictonary_of_lrs when p[1] is non-empty.\n",
"    Dictonary_of_lrs: mapping from key to LR value.\n",
"    Returns a list of (item, log(LR)) tuples, one per element of p[1].\n",
"    \"\"\"\n",
"    items = list(p[1])\n",
"    if not items:\n",
"        # Nothing to emit; skip the lookup entirely, as the original loop did.\n",
"        return []\n",
"    # The log depends only on p[0], so compute it once instead of once per item.\n",
"    log_lr = log(Dictonary_of_lrs[p[0]])\n",
"    return [(item, log_lr) for item in items]\n",
"\n",
"def getAddonNames(p, n):\n",
"    \"\"\"Fetch display names for addons from the addons.mozilla.org v3 API.\n",
"\n",
"    p: iterable of records; i[0] of each record is inserted into the request URL.\n",
"    n: maximum number of names to return.\n",
"    Returns a list of at most n names, in the order of p. Records whose request\n",
"    raises IOError are skipped.\n",
"    \"\"\"\n",
"    st_a = \"http://addons.mozilla.org/api/v3/addons/addon/\"\n",
"    st_c = \"/\"\n",
"    # Only stop early for a usable non-negative limit; any other n falls through\n",
"    # to plain slice semantics at the end.\n",
"    limited = n is not None and n >= 0\n",
"    recomendations = []\n",
"    for i in p:\n",
"        if limited and len(recomendations) >= n:\n",
"            # Enough names collected; avoid further network round trips.\n",
"            break\n",
"        url = st_a + str(i[0]) + st_c\n",
"        try:\n",
"            r = urllib2.urlopen(url)\n",
"            try:\n",
"                data = json.load(r)\n",
"            finally:\n",
"                # Always release the HTTP connection.\n",
"                r.close()\n",
"            # data['name'] is treated as a mapping and its first value is used\n",
"            # (NOTE(review): presumably keyed by locale, so the language is arbitrary - confirm).\n",
"            recomendations.append(data['name'].values()[0])\n",
"        except IOError:\n",
"            # Best effort: addon information unreachable, skip this record.\n",
"            continue\n",
"    # Slice from 0: the previous [1:n] dropped the top-ranked name and returned n-1 items.\n",
"    return recomendations[:n]"
]
},
{
"cell_type": "code",
"execution_count": 152,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"donnor_ids = LR_DICT.keys()\n",
"donnor_lrs = [log(lr) for lr in LR_DICT.values()]\n",
"\n",
"# Keep the records whose p[0] is a key of LR_DICT, expand each into (item, log(LR))\n",
"# pairs with assignLRval, and fold the values per key with add, starting from 0.\n",
"# Membership is tested against the dict (hash lookup) rather than the donnor_ids\n",
"# list, which would be scanned linearly for every record.\n",
"final_ranking_of_addons = (subset_addon_tok_filtered_rdd\n",
"    .filter(lambda p: p[0] in LR_DICT)\n",
"    .flatMap(lambda p: assignLRval(p, LR_DICT))\n",
"    .foldByKey(0, add)\n",
"    .cache())"
]
},
{
"cell_type": "code",
"execution_count": 153,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"[u'Skype',\n",
" u'\\u7f51\\u9875\\u622a\\u56fe',\n",
" u'Video DownloadHelper',\n",
" u'\\u0412\\u0438\\u0437\\u0443\\u0430\\u043b\\u044c\\u043d\\u044b\\u0435 \\u0437\\u0430\\u043a\\u043b\\u0430\\u0434\\u043a\\u0438 \\u043e\\u0442 \\u042f\\u043d\\u0434\\u0435\\u043a\\u0441',\n",
" u'Yandex.Bar (\\u042f\\u043d\\u0434\\u0435\\u043a\\u0441 \\u0411\\u0430\\u0440)',\n",
" u'Firebug',\n",
" u'\\u0421\\u043e\\u0432\\u0435\\u0442\\u043d\\u0438\\u043a \\u042f\\u043d\\u0434\\u0435\\u043a\\u0441.\\u041c\\u0430\\u0440\\u043a\\u0435\\u0442\\u0430',\n",
" u'Download YouTube Videos as MP4',\n",
" u'DownThemAll!',\n",
" u'Assistant Amazon',\n",
" u'uBlock Origin',\n",
" u'Adblock Plus Pop-up Addon',\n",
" u'Flash Video Downloader - YouTube HD Download [4K]',\n",
" u'Greasemonkey']"
]
},
"execution_count": 153,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Negating the second element makes takeOrdered (ascending) return the 100 entries\n",
"# with the largest values first.\n",
"addon_recommendations = final_ranking_of_addons.takeOrdered(\n",
"    100, key=lambda addon_score: -addon_score[1])\n",
"getAddonNames(addon_recommendations, 15)"
]
},
{
"cell_type": "markdown",
"metadata": {
"collapsed": true
},
"source": [
"## Next steps?"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 2",
"language": "python",
"name": "python2"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.11"
}
},
"nbformat": 4,
"nbformat_minor": 0
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment