Skip to content

Instantly share code, notes, and snippets.

@SamPenrose
Last active August 29, 2015 14:22
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save SamPenrose/1a8133d2ef6d251addfc to your computer and use it in GitHub Desktop.
Save SamPenrose/1a8133d2ef6d251addfc to your computer and use it in GitHub Desktop.
Mozilla FHRv4 subsessionId duplicates
{"nbformat_minor": 0, "cells": [{"execution_count": 1, "cell_type": "code", "source": "import ujson as json\nimport matplotlib.pyplot as plt\nimport pandas as pd\nimport numpy as np\nimport plotly.plotly as py\nimport networkx as nx\nimport collections\n\n\nfrom moztelemetry import get_pings, get_pings_properties, get_one_ping_per_client\n\n%pylab inline", "outputs": [{"output_type": "stream", "name": "stdout", "text": "Populating the interactive namespace from numpy and matplotlib\n"}], "metadata": {"collapsed": false, "trusted": true}}, {"execution_count": 3, "cell_type": "code", "source": "pings = get_pings(sc, app=\"Firefox\",\n channel=\"nightly\",\n submission_date=(\"20150507\",\"20150514\"),\n fraction=1,\n schema=\"v4\")", "outputs": [], "metadata": {"collapsed": true, "trusted": true}}, {"execution_count": 38, "cell_type": "code", "source": "def extract_sub(p):\n return p.get('payload', {}).get('info', {}).get('subsessionId', 'NO_SUBSESSION_ID')", "outputs": [], "metadata": {"collapsed": true, "trusted": true}}, {"execution_count": 39, "cell_type": "code", "source": "by_sub_map = pings.map(extract_sub)", "outputs": [], "metadata": {"collapsed": true, "trusted": true}}, {"execution_count": 41, "cell_type": "code", "source": "sub_ids = by_sub_map.collect()", "outputs": [], "metadata": {"collapsed": true, "trusted": true}}, {"execution_count": 46, "cell_type": "code", "source": "def dupes(l):\n from collections import defaultdict\n seen = set()\n dupes = defaultdict(int)\n for v in l:\n if v in seen:\n dupes[v] += 1\n else:\n seen.add(v)\n return dupes", "outputs": [], "metadata": {"collapsed": true, "trusted": true}}, {"execution_count": 49, "cell_type": "code", "source": "dd = dupes(sub_ids)", "outputs": [], "metadata": {"collapsed": false, "trusted": true}}, {"execution_count": 50, "cell_type": "code", "source": "len(dd)", "outputs": [{"execution_count": 50, "output_type": "execute_result", "data": {"text/plain": "17246"}, "metadata": {}}], "metadata": {"collapsed": false, "trusted": true}}, {"execution_count": 54, "cell_type": "code", "source": "len(sub_ids)", "outputs": [{"execution_count": 54, "output_type": "execute_result", "data": {"text/plain": "1594174"}, "metadata": {}}], "metadata": {"collapsed": false, "trusted": true}}, {"execution_count": 58, "cell_type": "code", "source": "dk = set(dd.keys())", "outputs": [], "metadata": {"collapsed": true, "trusted": true}}, {"execution_count": 66, "cell_type": "code", "source": "def save_multi_subsession(p):\n if p['id'] not in dk:\n return\n return (p['id'], p)", "outputs": [], "metadata": {"collapsed": true, "trusted": true}}, {"execution_count": 67, "cell_type": "code", "source": "dupe_map = pings.map(save_multi_subsession)", "outputs": [], "metadata": {"collapsed": false, "trusted": true}}, {"execution_count": 69, "cell_type": "code", "source": "def reduce_multi_subsession(ping1, ping2):\n if not (ping1 and ping2):\n return []\n if 'meta' in ping1:\n del ping1['meta']\n if 'meta' in ping2:\n del ping2['meta']\n if ping1 != ping2:\n return [ping1, ping2]", "outputs": [], "metadata": {"collapsed": true, "trusted": true}}, {"execution_count": 70, "cell_type": "code", "source": "diffs = dupe_map.reduce(reduce_multi_subsession)", "outputs": [], "metadata": {"collapsed": false, "trusted": true}}, {"execution_count": 71, "cell_type": "code", "source": "len(diffs)", "outputs": [{"execution_count": 71, "output_type": "execute_result", "data": {"text/plain": "0"}, "metadata": {}}], "metadata": {"collapsed": false, "trusted": true}}, {"execution_count": null, "cell_type": "code", "source": "", "outputs": [], "metadata": {"collapsed": true, "trusted": true}}], "nbformat": 4, "metadata": {"kernelspec": {"display_name": "Python 2", "name": "python2", "language": "python"}, "language_info": {"mimetype": "text/x-python", "nbconvert_exporter": "python", "version": "2.7.9", "name": "python", "file_extension": ".py", "pygments_lexer": "ipython2", "codemirror_mode": {"version": 2, "name": "ipython"}}}}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment