SamPenrose/Search comparison.ipynb

## Search comparison.ipynb
{"nbformat_minor": 0, "cells": [{"execution_count": 1, "cell_type": "code", "source": "import ujson as json", "outputs": [], "metadata": {"collapsed": true, "trusted": true}}, {"execution_count": 2, "cell_type": "code", "source": "pairs_dir_path = 's3n://net-mozaws-prod-us-west-2-pipeline-analysis/bcolloran/mergedDataPerClient/' \\\n                 'mreidDataThrough2015-08-04/2015-08-06/17033clients/'", "outputs": [], "metadata": {"collapsed": true, "trusted": true}}, {"execution_count": 3, "cell_type": "code", "source": "fraction = pairs_dir_path + 'part-000*' # This turns out to be all 17,033 clients.\nf = sc.sequenceFile(fraction)", "outputs": [], "metadata": {"collapsed": true, "trusted": true}}, {"execution_count": 4, "cell_type": "code", "source": "all_pairs = f.mapValues(json.loads)", "outputs": [], "metadata": {"collapsed": true, "trusted": true}}, {"execution_count": 5, "cell_type": "code", "source": "from collections import defaultdict\ndef get_overlap(pair, v2_extractor=None, v4_extractor=None):\n    v2_blobs = pair['v2'].get('data', {}).get('days', {}) # {'YYYY-MM-DD': dict}\n    v4_blobs = pair['v4'] # [{'creationDate': 'YYYY-MM-DD:...', 'k': val, ...}, ...]\n    # One blob per date in v2, multiple per date in v4\n    results = {'v2': {}, 'v4': defaultdict(list)}\n    if not (v2_blobs and v4_blobs):\n        return results\n\n    v2_dates = v2_blobs.keys()\n    v2_dates.sort()\n    v4_blobs.sort(key=lambda d: d['creationDate']) # probably redundant\n    v2_start, v2_end = v2_dates[0], v2_dates[-1] # possibly same\n    start = end = None\n\n    # Find overlap and walk v4 at same time.\n    for v4 in v4_blobs:\n        v4_date = v4['creationDate'][:10]\n        # Walk start up as far as we must.\n        if v4_date < v2_start:\n            # If v2 is entirely after v4, we never get past here.\n            continue\n        elif not start:\n            start = v4_date\n        # We have at least one overlapping date.\n        if v4_date <= v2_end:\n            # Walk end up as far as we can.\n            end = v4_date\n        else:\n            break\n        value = v4_extractor(v4) if v4_extractor else v4\n        results['v4'][v4_date].append(value)\n    if end is None: # We never reached last line of the loop.\n        return results\n\n    for v2_date in v2_dates:\n        if v2_date < start:\n            continue\n        if v2_date > end:\n            break\n        value = v2_extractor(v2_blobs[v2_date]) if v2_extractor else v2_blobs[v2_date]\n        results['v2'][v2_date] = value\n    return results", "outputs": [], "metadata": {"collapsed": true, "trusted": true}}, {"execution_count": 6, "cell_type": "code", "source": "import numbers\ndef v2_search_extractor(d):\n    total = 0\n    search_dict = d.get('org.mozilla.searches.counts', {})\n    counts = [v for k, v in search_dict.items() if k != '_v'\n              and isinstance(v, numbers.Number)]\n    total += sum(counts)\n    return int(total)\ndef v4_search_extractor(d):\n    '''\n    Count searches in v4_pings.\n    '''\n    total = 0\n    hists = d.get('payload/keyedHistograms/SEARCH_COUNTS')\n    if hists is None:\n        try:\n            hists = d.get('payload', {}).get('keyedHistograms', {}).get('SEARCH_COUNTS', {})\n        except AttributeError: # float sneaking in in Nightly somewhere\n            return 0\n    if hists == 'MISSING':\n        return 0\n    for blob in hists.values():\n        total += blob.get('sum', 0)\n    return total", "outputs": [], "metadata": {"collapsed": true, "trusted": true}}, {"execution_count": 7, "cell_type": "code", "source": "search_overlap = all_pairs.map(lambda (clientId, d): get_overlap(d, v2_search_extractor, v4_search_extractor))", "outputs": [], "metadata": {"collapsed": true, "trusted": true}}, {"execution_count": 8, "cell_type": "code", "source": "def search_combiner(d):\n    '''\n    v2 has one blob per date; v4 has multiple. sum() the latter.\n    '''\n    v2_total = 0\n    v4_total = 0\n    for date in d['v2']:\n        v2_total += d['v2'][date]\n        v4_total += sum(d['v4'][date])\n    return (v2_total, v4_total)", "outputs": [], "metadata": {"collapsed": true, "trusted": true}}, {"execution_count": 10, "cell_type": "code", "source": "combined = search_overlap.map(search_combiner)\ncombined.first()", "outputs": [{"execution_count": 10, "output_type": "execute_result", "data": {"text/plain": "(10, 10)"}, "metadata": {}}], "metadata": {"collapsed": false, "trusted": true}}, {"execution_count": 13, "cell_type": "code", "source": "v2c = combined.map(lambda tup: tup[0])\nv4c = combined.map(lambda tup: tup[1])\nv2r = v2c.collect()\nv4r = v4c.collect()", "outputs": [], "metadata": {"collapsed": true, "trusted": true}}, {"source": "For these 17,033 paired windows, we have 284,097 v2 searches and 293,528 v4.\nWhich is 17 searches per user per active window in this nightly population.\nAnd also an extra 3.3% in the v4 half of the paired windows.", "cell_type": "markdown", "metadata": {}}, {"execution_count": 35, "cell_type": "code", "source": "sum(v2r), sum(v4r)", "outputs": [{"execution_count": 35, "output_type": "execute_result", "data": {"text/plain": "(284097, 293528)"}, "metadata": {}}], "metadata": {"collapsed": false, "trusted": true}}, {"execution_count": 11, "cell_type": "code", "source": "import plotly.plotly as PL\nimport plotly.graph_objs as GO", "outputs": [], "metadata": {"collapsed": true, "trusted": true}}, {"execution_count": 14, "cell_type": "code", "source": "trace = GO.Scatter(x=v2r, y=v4r, mode='markers', marker=GO.Marker(opacity=0.1))\ndata = GO.Data([trace])\nlayout = GO.Layout(\n    xaxis=GO.XAxis(\n        range=[0, 150]\n    ),\n    yaxis=GO.YAxis(\n        range=[0, 150]\n    )\n)\nfigure = GO.Figure(data=data, layout=layout)", "outputs": [], "metadata": {"collapsed": false, "trusted": true}}, {"source": "First we scatter-graph them. Unfortunately, an opacity of 0.1%\nis not nearly enough to show relative density when 17,000 values\nare clustered mostly in a narrow band about 20x100 pixels.", "cell_type": "markdown", "metadata": {}}, {"execution_count": 15, "cell_type": "code", "source": "PL.iplot(figure, filename='searches-X-v2-Y-v4-08-06')", "outputs": [{"execution_count": 15, "output_type": "execute_result", "data": {"text/plain": "<plotly.tools.PlotlyDisplay object>", "text/html": "<iframe id=\"igraph\" scrolling=\"no\" style=\"border:none;\"seamless=\"seamless\" src=\"https://plot.ly/~mozilla/221.embed\" height=\"525\" width=\"100%\"></iframe>"}, "metadata": {}}], "metadata": {"collapsed": true, "trusted": true}}, {"source": "Brendan's suggestion was to calculate the per-client difference\nfor each day and examine as a histogram. I accidentally used\n\"combined\" above (which is per-client, not per-client-day)\nand found very close incidence. When I reran with per-client-day,\nPlotly choked on the resulting 89,247 values data.", "cell_type": "markdown", "metadata": {}}, {"execution_count": 44, "cell_type": "code", "source": "def combine_by_day(d):\n    days = d['v2'].keys()\n    result = {}\n    for date in days:\n        result[date] = (d['v2'][date], sum(d['v4'][date]))\n    return result\nsof = search_overlap.first()\nprint sof\ncombine_by_day(sof)", "outputs": [{"output_type": "stream", "name": "stdout", "text": "{'v2': {u'2015-07-27': 1, u'2015-08-01': 2, u'2015-08-02': 0, u'2015-07-30': 0, u'2015-07-31': 5, u'2015-07-29': 1, u'2015-07-28': 1}, 'v4': defaultdict(<type 'list'>, {u'2015-07-27': [1], u'2015-08-01': [2, 0, 0], u'2015-08-02': [0], u'2015-07-30': [0, 0], u'2015-07-31': [3, 1, 1, 0], u'2015-07-29': [0, 0, 1, 0], u'2015-07-28': [1]})}\n"}, {"execution_count": 44, "output_type": "execute_result", "data": {"text/plain": "{u'2015-07-27': (1, 1),\n u'2015-07-28': (1, 1),\n u'2015-07-29': (1, 1),\n u'2015-07-30': (0, 0),\n u'2015-07-31': (5, 5),\n u'2015-08-01': (2, 2),\n u'2015-08-02': (0, 0)}"}, "metadata": {}}], "metadata": {"collapsed": false, "trusted": true}}, {"execution_count": 46, "cell_type": "code", "source": "BOUND = 10\ndef delta(d, base='v2', bound=BOUND):\n    results = []\n    for date, (v2, v4) in d.items():\n        source, alternate = v2, v4\n        if base == 'v4':\n            source, alternate = v4, v2\n        difference = source - alternate\n        divisor = float(source or alternate or 1.0)\n        value = round(difference/divisor, 1)\n        if value > bound:\n            value = bound\n        elif value < -bound:\n            value = -bound\n        results.append(value)\n    return results\n\nv2_delta = search_overlap.map(combine_by_day).map(delta)\n# v4_delta = search_overlap.map(lambda d: delta(sum(d['v4']), d['v2']))\nv2_delta.take(10)", "outputs": [{"execution_count": 46, "output_type": "execute_result", "data": {"text/plain": "[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],\n [0.0, 0.0],\n [0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],\n [0.0, 0.0, 0.0, 0.0, 0.0, 0.0],\n [0.5, -1.0],\n [0.0],\n [0.0],\n [],\n [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0],\n [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]]"}, "metadata": {}}], "metadata": {"collapsed": false, "trusted": true}}, {"execution_count": 47, "cell_type": "code", "source": "v2_delta_collected = v2_delta.collect()", "outputs": [], "metadata": {"collapsed": false, "trusted": true}}, {"execution_count": 61, "cell_type": "code", "source": "v2_delta_hist = GO.Data([GO.Histogram(x=all89247)])\nPL.iplot(v2_delta_hist, filename='searches-v2-delta-hist-08-06',\n         layout=GO.Layout(xaxis=GO.XAxis(range=[-BOUND, BOUND])))", "outputs": [{"output_type": "stream", "name": "stderr", "text": "/home/hadoop/anaconda/lib/python2.7/site-packages/plotly/plotly/plotly.py:187: UserWarning:\n\nWoah there! Look at all those points! Due to browser limitations, Plotly has a hard time graphing more than 500k data points for line charts, or 40k points for other types of charts. Here are some suggestions:\n(1) Trying using the image API to return an image instead of a graph URL\n(2) Use matplotlib\n(3) See if you can create your visualization with fewer data points\n\nIf the visualization you're using aggregates points (e.g., box plot, histogram, etc.) you can disregard this warning.\n\n"}, {"execution_count": 61, "output_type": "execute_result", "data": {"text/plain": "<plotly.tools.PlotlyDisplay object>", "text/html": "<iframe id=\"igraph\" scrolling=\"no\" style=\"border:none;\"seamless=\"seamless\" src=\"https://plot.ly/~mozilla/223.embed\" height=\"525\" width=\"100%\"></iframe>"}, "metadata": {}}], "metadata": {"collapsed": false, "trusted": true}}, {"execution_count": 60, "cell_type": "code", "source": "all89247 = []\nfor l in v2_delta_collected:\n    all89247.extend(l)\nall89247.sort()\nprint len(all89247)\nfor i in range (246, 90000, 1000):\n    if i >= len(all89247):\n        print \"done at:\", i\n    else:\n        print all89247[i],", "outputs": [{"output_type": "stream", "name": "stdout", "text": "89247\n-8.5 -2.0 -1.0 -1.0 -1.0 -1.0 -1.0 -0.6 -0.3 -0.1 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.1 0.2 0.3 0.5 0.6 0.8 1.0 1.0 1.0 1.0\n"}], "metadata": {"collapsed": false, "trusted": true}}, {"source": "Here we look at the count without trying to match windows.\nI think the enormous v2 number just means that nightly clients\nhave a much longer tail of v2 activity, which we would expect.\nThe way to test it would be to match the starts but not the\nends of the pairs.", "cell_type": "markdown", "metadata": {}}, {"execution_count": 38, "cell_type": "code", "source": "def walk_pair(pair, v2_extractor, v4_extractor):\n    '''\n    Like get_overlap, except run the extractors on all\n    activity blobs in both versions.\n    '''\n    v2_blobs = pair['v2'].get('data', {}).get('days', {}) # {'YYYY-MM-DD': dict}\n    v4_blobs = pair['v4'] # [{'creationDate': 'YYYY-MM-DD:...', 'k': val, ...}, ...]\n    # One blob per date in v2, multiple per date in v4\n    results = {'v2': {}, 'v4': defaultdict(list)}\n    if not (v2_blobs and v4_blobs):\n        return results\n\n    v2_dates = v2_blobs.keys()\n    v2_dates.sort()\n    v4_blobs.sort(key=lambda d: d['creationDate']) # probably redundant\n    for v4 in v4_blobs:\n        v4_date = v4['creationDate'][:10]\n        value = v4_extractor(v4) if v4_extractor else v4\n        results['v4'][v4_date].append(value)\n\n    for v2_date in v2_dates:\n        value = v2_extractor(v2_blobs[v2_date]) if v2_extractor else v2_blobs[v2_date]\n        results['v2'][v2_date] = value\n    return results\nwalk_pair(all_pairs.first()[1], v2_search_extractor, v4_search_extractor)", "outputs": [{"execution_count": 38, "output_type": "execute_result", "data": {"text/plain": "{'v2': {u'2015-06-13': 0,\n  u'2015-06-15': 16,\n  u'2015-06-16': 21,\n  u'2015-06-17': 2,\n  u'2015-06-18': 3,\n  u'2015-06-19': 10,\n  u'2015-06-20': 8,\n  u'2015-06-21': 11,\n  u'2015-06-22': 16,\n  u'2015-06-23': 4,\n  u'2015-06-24': 14,\n  u'2015-06-25': 5,\n  u'2015-06-26': 24,\n  u'2015-06-27': 7,\n  u'2015-06-28': 2,\n  u'2015-06-29': 3,\n  u'2015-06-30': 11,\n  u'2015-07-01': 19,\n  u'2015-07-02': 23,\n  u'2015-07-03': 10,\n  u'2015-07-04': 20,\n  u'2015-07-05': 6,\n  u'2015-07-06': 5,\n  u'2015-07-07': 2,\n  u'2015-07-08': 3,\n  u'2015-07-09': 2,\n  u'2015-07-10': 4,\n  u'2015-07-11': 1,\n  u'2015-07-12': 10,\n  u'2015-07-13': 3,\n  u'2015-07-14': 2,\n  u'2015-07-15': 18,\n  u'2015-07-16': 1,\n  u'2015-07-17': 9,\n  u'2015-07-18': 1,\n  u'2015-07-19': 30,\n  u'2015-07-20': 0,\n  u'2015-07-21': 1,\n  u'2015-07-22': 6,\n  u'2015-07-23': 0,\n  u'2015-07-24': 0,\n  u'2015-07-25': 6,\n  u'2015-07-26': 1,\n  u'2015-07-27': 1,\n  u'2015-07-28': 1,\n  u'2015-07-29': 1,\n  u'2015-07-30': 0,\n  u'2015-07-31': 5,\n  u'2015-08-01': 2,\n  u'2015-08-02': 0},\n 'v4': defaultdict(<type 'list'>, {u'2015-07-30': [0, 0], u'2015-08-04': [4], u'2015-08-01': [2, 0, 0], u'2015-08-02': [0], u'2015-08-03': [4, 2, 9, 3], u'2015-07-27': [1], u'2015-07-31': [3, 1, 1, 0], u'2015-07-29': [0, 0, 1, 0], u'2015-07-28': [1]})}"}, "metadata": {}}], "metadata": {"collapsed": true, "trusted": true}}, {"execution_count": 40, "cell_type": "code", "source": "untrimmed_search_totals = all_pairs.map(lambda (clientId, d): walk_pair(d, v2_search_extractor, v4_search_extractor))\nuntrimmed_combined = untrimmed_search_totals.map(search_combiner)\nraw_v2_totals = untrimmed_combined.map(lambda tup: tup[0]).sum()\nraw_v4_totals = untrimmed_combined.map(lambda tup: tup[1]).sum()\nprint raw_v2_totals, raw_v4_totals", "outputs": [{"output_type": "stream", "name": "stdout", "text": "4135340 293528\n"}], "metadata": {"collapsed": false, "trusted": true}}], "nbformat": 4, "metadata": {"kernelspec": {"display_name": "Python 2", "name": "python2", "language": "python"}, "language_info": {"mimetype": "text/x-python", "nbconvert_exporter": "python", "version": "2.7.9", "name": "python", "file_extension": ".py", "pygments_lexer": "ipython2", "codemirror_mode": {"version": 2, "name": "ipython"}}}}