SamPenrose/Lost subsession estimates.ipynb

## Lost subsession estimates.ipynb
{"nbformat_minor": 0, "cells": [{"execution_count": 1, "cell_type": "code", "source": "from moztelemetry import get_pings, get_pings_properties", "outputs": [], "metadata": {"collapsed": true, "trusted": true}}, {"execution_count": 2, "cell_type": "code", "source": "build_ids = (\"20150722000000\", \"20150729999999\")\nmain_pings = get_pings(sc,\n                       app=\"Firefox\",\n                       channel=\"nightly\",\n                       build_id=build_ids,\n                       doc_type=\"main\",\n                       schema=\"v4\")", "outputs": [], "metadata": {"collapsed": true, "trusted": true}}, {"execution_count": 3, "cell_type": "code", "source": "import binascii\ntenth = main_pings.filter(lambda d: binascii.crc32((d or {}).get(\"clientId\", 'a')) % 100 < 10) # 'a' -> 11\ntenth_total = tenth.count()\nprint tenth_total # 128,543 pings on 3 Aug; 131,326 on 4 Aug; 132,427 on 5 Aug", "outputs": [{"output_type": "stream", "name": "stdout", "text": "132720\n"}], "metadata": {"collapsed": false, "trusted": true}}, {"execution_count": 4, "cell_type": "code", "source": "subset_10 = get_pings_properties(tenth, [\"clientId\",\n                                         \"meta/documentId\",\n                                         \"meta/submissionDate\",\n                                         \"meta/creationTimestamp\",\n                                         \"environment/system/os/name\",\n                                         \"payload/info/reason\",\n                                         \"payload/info/sessionId\",\n                                         \"payload/info/subsessionId\",\n                                         \"payload/info/previousSessionId\",\n                                         \"payload/info/previousSubsessionId\",\n                                         \"payload/info/subsessionCounter\",\n                                         \"payload/info/profileSubsessionCounter\",\n                                 
        \"payload/simpleMeasurements/firstPaint\",\n                                         \"payload/simpleMeasurements/savedPings\",]) # fewer fields", "outputs": [], "metadata": {"collapsed": true, "trusted": true}}, {"execution_count": 5, "cell_type": "code", "source": "from operator import itemgetter\ndef dedupe_and_sort(group):\n    key, history = group\n    seen = set()\n    result = []\n    for fragment in history:\n        id = fragment[\"meta/documentId\"]\n        if id in seen:\n            continue          \n        seen.add(id)\n        result.append(fragment)    \n    result.sort(key=itemgetter(\"meta/creationTimestamp\"))\n    return result\n\ngrouped = subset_10.groupBy(lambda x: x[\"clientId\"]).map(dedupe_and_sort).collect()", "outputs": [], "metadata": {"collapsed": true, "trusted": true}}, {"execution_count": 6, "cell_type": "code", "source": "len(grouped) # 7,421 clients on 3 Aug, 7,571 on 4 Aug, 7,671 on 5 Aug", "outputs": [{"execution_count": 6, "output_type": "execute_result", "data": {"text/plain": "7671"}, "metadata": {}}], "metadata": {"collapsed": false, "trusted": true}}, {"source": "Up to here we have followed (not quite exactly)\nhttp://nbviewer.ipython.org/gist/vitillo/3047c0d896b08f75c403\nNext we'll get a set of histories with broken subsessionId chains.", "cell_type": "markdown", "metadata": {}}, {"execution_count": 7, "cell_type": "code", "source": "def clients_with_gaps(histories):\n    clients = {}\n    for history in histories:\n        last = history[0]\n        for current in history[1:]:\n            if last['payload/info/subsessionId'] != current['payload/info/previousSubsessionId']:\n                clients[current['clientId']] = history\n                break\n            last = current\n    return clients", "outputs": [], "metadata": {"collapsed": true, "trusted": true}}, {"execution_count": 8, "cell_type": "code", "source": "id_gaps = clients_with_gaps(grouped)", "outputs": [], "metadata": {"collapsed": true, 
"trusted": true}}, {"execution_count": 9, "cell_type": "code", "source": "len(id_gaps)", "outputs": [{"execution_count": 9, "output_type": "execute_result", "data": {"text/plain": "639"}, "metadata": {}}], "metadata": {"collapsed": false, "trusted": true}}, {"execution_count": 10, "cell_type": "code", "source": "example = id_gaps.items()[0]\n# This one was reversed\nexample[1][0]['payload/info/previousSessionId'], example[1][1]['payload/info/sessionId']", "outputs": [{"execution_count": 10, "output_type": "execute_result", "data": {"text/plain": "(u'59f5a5e5-d629-4f44-9ca2-9ffa9d1b01e9',\n u'59f5a5e5-d629-4f44-9ca2-9ffa9d1b01e9')"}, "metadata": {}}], "metadata": {"collapsed": false, "trusted": true}}, {"source": "So 639 of 7,671 clients may have one or more subsession\nchain gaps, and the first one we've looked at is a false\npositive. Let's try a couple angles.", "cell_type": "markdown", "metadata": {}}, {"execution_count": 12, "cell_type": "code", "source": "# How many of these have the \"missing\" subsessionId somewhere?\n# Alternately, how many span a session change?\nfrom collections import defaultdict\ndef gaps_when_sorted_by_counter(histories):\n    results = defaultdict(dict)\n    for history in histories:\n        history.sort(key=itemgetter(\"meta/creationTimestamp\"))\n        subids = set([d['payload/info/subsessionId'] for d in history])\n        last = history[0]\n        clientId = last['clientId']\n        for current in history[1:]:\n            if last['payload/info/subsessionId'] != current['payload/info/previousSubsessionId']:\n                if current['payload/info/previousSubsessionId'] in subids:\n                    results['order-switch'][clientId] = history\n                else:\n                    results['other'][clientId] = history\n            last = current\n    return results\nbreakdown = gaps_when_sorted_by_counter(id_gaps.values())\nfor k in breakdown:\n    print k, len(breakdown[k])", "outputs": [{"output_type": "stream", 
"name": "stdout", "text": "order-switch 231\nother 553\n"}], "metadata": {"collapsed": false, "trusted": true}}, {"source": "It seems reasonable to say that order-switches are not losses.", "cell_type": "markdown", "metadata": {}}, {"execution_count": 15, "cell_type": "code", "source": "def missing_ping_estimate(histories, k):\n    gaps = defaultdict(set) # {clientId: set([affected, dates])}\n    gaps_count = 0\n    losses_by_counter = 0\n    for history in histories:\n        history.sort(key=itemgetter(k))\n        ssids = set([d['payload/info/subsessionId'] for d in history])\n        pssids = set([d['payload/info/previousSubsessionId'] for d in history])\n        last = history[0]\n        for current in history[1:]:\n            last_id = last['payload/info/subsessionId']\n            current_id = current['payload/info/previousSubsessionId']\n            if last_id != current_id:\n                if (last_id not in pssids) and (current_id not in ssids):\n                    gaps_count += 1\n                    gaps[current['clientId']].add(last['meta/submissionDate'])\n                    gaps[current['clientId']].add(current['meta/submissionDate'])\n                    losses_by_counter += max(1,\n                        current['payload/info/profileSubsessionCounter'] - \\\n                        last['payload/info/profileSubsessionCounter'])\n            last = current\n    return {'gaps': gaps, 'gaps_count': gaps_count, 'losses_by_counter': losses_by_counter}\n\nhistories = breakdown['other'].values()\ngaps_by_stamp = missing_ping_estimate(histories, 'meta/creationTimestamp')\ngaps_by_counter = missing_ping_estimate(histories, 'payload/info/profileSubsessionCounter')\nprint \"By timestamp: %d gaps, %d losses(?)\" % (gaps_by_stamp['gaps_count'], gaps_by_stamp['losses_by_counter'])\nprint \"By pSC: %d gaps, %d losses(?)\" % (gaps_by_counter['gaps_count'], gaps_by_counter['losses_by_counter'])", "outputs": [{"output_type": "stream", "name": "stdout", "text": 
"By timestamp: 882 gaps, 2938 losses(?)\nBy pSC: 806 gaps, 10263 losses(?)\n"}], "metadata": {"collapsed": false, "trusted": true}}, {"execution_count": 22, "cell_type": "code", "source": "print \"Loss estimate, trusting creationTimestamp:\",\nprint round((100.0*gaps_by_stamp['losses_by_counter']) / tenth_total, 1), '%'\nprint \"Loss estimate, trusting profileSubsessionCounter:\",\nprint round((100.0*gaps_by_counter['losses_by_counter']) / tenth_total, 1), '%'", "outputs": [{"output_type": "stream", "name": "stdout", "text": "Loss estimate, trusting creationTimestamp: 2.2 %\nLoss estimate, trusting profileSubsessionCounter: 7.7 %\n"}], "metadata": {"collapsed": false, "trusted": true}}, {"source": "We don't really know how to map from \"previous*\" fields to lost pings. If we assume every positive gap records that many lost pings and every negative gap one lost ping (!), we get very different values depending on whether we start our sorting by timestamp or not. We probably should trust the timestamp more, says Georg.\n\nThere is one more identified source of false positives: channel-switching.", "cell_type": "markdown", "metadata": {}}, {"execution_count": 16, "cell_type": "code", "source": "lines = list(open('switchers_by_day.txt'))\nlines = [s.strip().split() for s in lines]\nchannel_switches = defaultdict(list)\nfor date, cid, _ in lines:\n    channel_switches[cid].append(date)\nchannel_switches.items()[0]", "outputs": [{"execution_count": 16, "output_type": "execute_result", "data": {"text/plain": "('d82de1f4-f0fe-442c-9864-676cc2649078', ['20150722', '20150729', '20150731'])"}, "metadata": {}}], "metadata": {"collapsed": false, "trusted": true}}, {"execution_count": 17, "cell_type": "code", "source": "switch_count_stamp = 0\nfor clientId, dates in gaps_by_stamp['gaps'].items():\n    switch_dates = channel_switches.get(clientId, [])\n    for switch in switch_dates:\n        # By making dates a set up a few cells, I may undercount.\n        if switch in dates:\n 
           switch_count_stamp += 1\nswitch_count_counter = 0\nfor clientId, dates in gaps_by_counter['gaps'].items():\n    switch_dates = channel_switches.get(clientId, [])\n    for switch in switch_dates:\n        if switch in dates:\n            switch_count_counter += 1\nprint \"Sloppy affected-by-channel-switch counts:\"\nprint \"By stamp: %d By counter: %d\" % (switch_count_stamp, switch_count_counter)", "outputs": [{"output_type": "stream", "name": "stdout", "text": "Sloppy affected-by-channel-switch counts:\nBy stamp: 213 By counter: 198\n"}], "metadata": {"collapsed": false, "trusted": true}}, {"source": "That's 25% of the gaps.", "cell_type": "markdown", "metadata": {}}, 
{"execution_count": 30, "cell_type": "code", "source": "def missing_pings_sans_switches(histories, k, switches=None):\n    \"\"\"Estimate lost pings, setting aside gaps that touch a channel-switch date.\n\n    Every history is ordered by field `k` using a sorted copy, so the caller's\n    lists are not reordered. Two neighbouring pings form a gap when the\n    earlier ping's subsessionId is nobody's previousSubsessionId and the\n    later ping's previousSubsessionId is nobody's subsessionId within that\n    history. A gap whose earlier or later meta/submissionDate is one of the\n    client's switch dates is tallied as a switch hit instead of a loss.\n\n    `switches` maps clientId -> list of switch dates; when omitted the\n    notebook-level `channel_switches` mapping is used.\n\n    Returns a dict with 'gaps_count', 'losses_by_counter' (per gap, the\n    profileSubsessionCounter difference counted as at least 1),\n    'switch_client_hits' (histories whose client has any switch date) and\n    'switch_date_hits' (gaps set aside).\n    \"\"\"\n    if switches is None:\n        switches = channel_switches\n    gaps_count = 0\n    losses_by_counter = 0\n    switch_client_hits = 0\n    switch_date_hits = 0\n    for history in histories:\n        ordered = sorted(history, key=itemgetter(k))\n        ssids = set(d['payload/info/subsessionId'] for d in ordered)\n        pssids = set(d['payload/info/previousSubsessionId'] for d in ordered)\n        last = ordered[0]\n        switch_dates = switches.get(last['clientId'], [])\n        switch_client_hits += int(bool(switch_dates))\n        for current in ordered[1:]:\n            last_id = last['payload/info/subsessionId']\n            current_id = current['payload/info/previousSubsessionId']\n            if last_id != current_id and last_id not in pssids and current_id not in ssids:\n                if (last['meta/submissionDate'] in switch_dates or\n                        current['meta/submissionDate'] in switch_dates):\n                    switch_date_hits += 1\n                else:\n                    gaps_count += 1\n                    counter_step = (current['payload/info/profileSubsessionCounter'] -\n                                    last['payload/info/profileSubsessionCounter'])\n                    # A zero or negative step still counts as one lost ping.\n                    losses_by_counter += max(1, counter_step)\n            last = current\n    return {'gaps_count': gaps_count, 'losses_by_counter': losses_by_counter,\n            'switch_client_hits': switch_client_hits, 'switch_date_hits': switch_date_hits}\n# Stored under their own names so gaps_by_stamp / gaps_by_counter keep their 'gaps' entry\n# and the cells that read it can be re-run.\nsans_switch_by_stamp = missing_pings_sans_switches(histories, 'meta/creationTimestamp')\nsans_switch_by_counter = missing_pings_sans_switches(histories, 'payload/info/profileSubsessionCounter')\nprint \"By timestamp: %d gaps, %d losses(?)\" % (sans_switch_by_stamp['gaps_count'], sans_switch_by_stamp['losses_by_counter'])\nprint \" switch hits:\", sans_switch_by_stamp['switch_client_hits'], sans_switch_by_stamp['switch_date_hits']\nprint \"By pSC: %d gaps, %d losses(?)\" % (sans_switch_by_counter['gaps_count'], sans_switch_by_counter['losses_by_counter'])\nprint \" switch hits:\", sans_switch_by_counter['switch_client_hits'], sans_switch_by_counter['switch_date_hits']", "outputs": [{"output_type": "stream", "name": "stdout", "text": " By timestamp: 620 gaps, 1599 losses(?)\n switch hits: 100 262\nBy pSC: 561 gaps, 8397 losses(?)\n switch hits: 100 245\n"}], "metadata": {"collapsed": false, "trusted": true}}, {"execution_count": 31, "cell_type": "code", "source": "print \"Loss estimate, creationTimestamp sans switch dates:\",\nprint round((100.0*sans_switch_by_stamp['losses_by_counter']) / tenth_total, 1), '%'\nprint \"Loss estimate, profileSubsessionCounter sans switch dates:\",\nprint round((100.0*sans_switch_by_counter['losses_by_counter']) / tenth_total, 1), '%'", "outputs": [{"output_type": "stream", "name": "stdout", "text": "Loss estimate, creationTimestamp sans switch dates: 1.2 %\nLoss estimate, profileSubsessionCounter sans switch dates: 6.3 %\n"}], "metadata": {"collapsed": false, "trusted": true}}], "nbformat": 4, "metadata": {"kernelspec": {"display_name": "Python 2", "name": "python2", "language": "python"}, "language_info": {"mimetype": "text/x-python", "nbconvert_exporter": "python", "version": "2.7.9", "name": "python", "file_extension": ".py", "pygments_lexer": "ipython2", 
"codemirror_mode": {"version": 2, "name": "ipython"}}}}