Last active
August 29, 2015 14:22
-
-
Save SamPenrose/1a8133d2ef6d251addfc to your computer and use it in GitHub Desktop.
Mozilla FHRv4 subsessionId duplicates
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Mozilla FHRv4 subsessionId duplicates\n",
    "\n",
    "Finds `payload.info.subsessionId` values that occur on more than one Firefox nightly v4 ping, then checks whether the pings sharing an id still differ once their `meta` entry is ignored."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": false,
    "trusted": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": "Populating the interactive namespace from numpy and matplotlib\n"
    }
   ],
   "source": [
    "import ujson as json\n",
    "import matplotlib.pyplot as plt\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "import plotly.plotly as py\n",
    "import networkx as nx\n",
    "import collections\n",
    "\n",
    "\n",
    "from moztelemetry import get_pings, get_pings_properties, get_one_ping_per_client\n",
    "\n",
    "%pylab inline"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": true,
    "trusted": true
   },
   "outputs": [],
   "source": [
    "pings = get_pings(sc, app=\"Firefox\",\n",
    "                  channel=\"nightly\",\n",
    "                  submission_date=(\"20150507\",\"20150514\"),\n",
    "                  fraction=1,\n",
    "                  schema=\"v4\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true,
    "trusted": true
   },
   "outputs": [],
   "source": [
    "# Sentinel returned for pings that carry no subsessionId.\n",
    "NO_SUBSESSION_ID = 'NO_SUBSESSION_ID'\n",
    "\n",
    "\n",
    "def extract_sub(p):\n",
    "    \"\"\"Return p['payload']['info']['subsessionId'].\n",
    "\n",
    "    Falls back to NO_SUBSESSION_ID when any of the three keys is missing.\n",
    "    \"\"\"\n",
    "    return p.get('payload', {}).get('info', {}).get('subsessionId', NO_SUBSESSION_ID)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "metadata": {
    "collapsed": true,
    "trusted": true
   },
   "outputs": [],
   "source": [
    "by_sub_map = pings.map(extract_sub)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 41,
   "metadata": {
    "collapsed": true,
    "trusted": true
   },
   "outputs": [],
   "source": [
    "sub_ids = by_sub_map.collect()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true,
    "trusted": true
   },
   "outputs": [],
   "source": [
    "def dupes(l):\n",
    "    \"\"\"Count repeat occurrences of each value in an iterable.\n",
    "\n",
    "    Returns a defaultdict(int) that maps every value seen more than once\n",
    "    to the number of times it appeared after its first occurrence.\n",
    "    \"\"\"\n",
    "    seen = set()\n",
    "    # Named differently from the function so the function is not shadowed.\n",
    "    repeat_counts = collections.defaultdict(int)\n",
    "    for v in l:\n",
    "        if v in seen:\n",
    "            repeat_counts[v] += 1\n",
    "        else:\n",
    "            seen.add(v)\n",
    "    return repeat_counts"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 49,
   "metadata": {
    "collapsed": false,
    "trusted": true
   },
   "outputs": [],
   "source": [
    "dd = dupes(sub_ids)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 50,
   "metadata": {
    "collapsed": false,
    "trusted": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": "17246"
     },
     "execution_count": 50,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(dd)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 54,
   "metadata": {
    "collapsed": false,
    "trusted": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": "1594174"
     },
     "execution_count": 54,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(sub_ids)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Compare pings that share a subsessionId\n",
    "\n",
    "Pings are keyed by `subsessionId`, grouped, and every ping in a group is compared with the first ping of that group. The outputs of the cells below are cleared and need a re-run."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true,
    "trusted": true
   },
   "outputs": [],
   "source": [
    "dk = set(dd.keys())\n",
    "# Pings without a subsessionId all map to the same sentinel; they are not\n",
    "# duplicates of one another, so keep the sentinel out of the lookup set.\n",
    "dk.discard(NO_SUBSESSION_ID)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true,
    "trusted": true
   },
   "outputs": [],
   "source": [
    "def save_multi_subsession(p):\n",
    "    \"\"\"Key a ping by its subsessionId when that id is a known duplicate.\n",
    "\n",
    "    Returns (subsessionId, p) if the id is in dk, otherwise None.\n",
    "    \"\"\"\n",
    "    # dk holds subsessionIds, so the lookup must use the subsessionId\n",
    "    # (not the ping's top-level 'id').\n",
    "    sub_id = extract_sub(p)\n",
    "    if sub_id not in dk:\n",
    "        return None\n",
    "    return (sub_id, p)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "trusted": true
   },
   "outputs": [],
   "source": [
    "# Drop the None placeholders so only (subsessionId, ping) pairs remain.\n",
    "dupe_map = pings.map(save_multi_subsession).filter(lambda pair: pair is not None)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true,
    "trusted": true
   },
   "outputs": [],
   "source": [
    "def reduce_multi_subsession(ping1, ping2):\n",
    "    \"\"\"Compare two pings while ignoring their 'meta' entries.\n",
    "\n",
    "    Returns a two-element list holding copies of both pings without\n",
    "    'meta' when they differ, and [] when they are equal or when either\n",
    "    ping is empty/None.\n",
    "    \"\"\"\n",
    "    if not (ping1 and ping2):\n",
    "        return []\n",
    "    # Compare copies so the input records are never mutated.\n",
    "    body1 = {k: v for k, v in ping1.items() if k != 'meta'}\n",
    "    body2 = {k: v for k, v in ping2.items() if k != 'meta'}\n",
    "    if body1 != body2:\n",
    "        return [body1, body2]\n",
    "    return []\n",
    "\n",
    "\n",
    "def diff_subsession_group(group):\n",
    "    \"\"\"Compare each ping of a (subsessionId, pings) group with the first one.\n",
    "\n",
    "    Returns a list of (subsessionId, [first, other]) entries, one for\n",
    "    every ping whose content differs from the first ping of the group.\n",
    "    \"\"\"\n",
    "    sub_id, group_pings = group\n",
    "    group_pings = list(group_pings)\n",
    "    first = group_pings[0]\n",
    "    found = []\n",
    "    for other in group_pings[1:]:\n",
    "        pair = reduce_multi_subsession(first, other)\n",
    "        if pair:\n",
    "            found.append((sub_id, pair))\n",
    "    return found"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "trusted": true
   },
   "outputs": [],
   "source": [
    "# RDD.reduce needs a commutative, associative function over the records\n",
    "# themselves, which a pairwise diff is not; group by subsessionId instead.\n",
    "diffs = dupe_map.groupByKey().flatMap(diff_subsession_group).collect()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "trusted": true
   },
   "outputs": [],
   "source": [
    "len(diffs)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 2",
   "language": "python",
   "name": "python2"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.9"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment