Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Save SamPenrose/05e4dd652c6b95fec6bc to your computer and use it in GitHub Desktop.
Save SamPenrose/05e4dd652c6b95fec6bc to your computer and use it in GitHub Desktop.
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Session Signature matching"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 81,
   "metadata": {
    "collapsed": false,
    "trusted": true
   },
   "outputs": [],
   "source": [
    "import ujson as json\n",
    "from operator import add\n",
    "# %pylab inline"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Load the merged per-client records"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 82,
   "metadata": {
    "collapsed": true,
    "trusted": true
   },
   "outputs": [],
   "source": [
    "outBucketName = \"net-mozaws-prod-us-west-2-pipeline-analysis\"\n",
    "pathToOutput = \"/bcolloran/mergedDataPerClient/nightly/2015-06-15/10009clients/\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 83,
   "metadata": {
    "collapsed": true,
    "trusted": true
   },
   "outputs": [],
   "source": [
    "# for a tiny sample, you can load one part: \"part-00000\"\n",
    "# or you can do more--\n",
    "# ten parts: part-0000*\n",
    "# or 10% of parts: part-*0\n",
    "# or all parts: part-*\n",
    "path_to_all = \"s3n://\"+outBucketName+pathToOutput+\"part-*\"\n",
    "# NOTE(review): `sc` is never defined in this notebook; presumably the\n",
    "# kernel injects a SparkContext under that name -- confirm.\n",
    "f = sc.sequenceFile(path_to_all)\n",
    "# Each value is parsed from a JSON string; keys are left untouched.\n",
    "load_all = f.mapValues(json.loads)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Count searches per client"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 84,
   "metadata": {
    "collapsed": true,
    "trusted": true
   },
   "outputs": [],
   "source": [
    "def get_v4_searches(v4):\n",
    "    \"\"\"Sum the SEARCH_COUNTS histogram sums across a client's v4 records.\n",
    "\n",
    "    `v4` is iterated and every item is read with `.get`, so it is\n",
    "    presumably a list of dicts (one per v4 ping -- TODO confirm).\n",
    "    Items whose 'payload/keyedHistograms/SEARCH_COUNTS' value is the\n",
    "    string 'MISSING' are skipped; an absent key or an absent 'sum'\n",
    "    contributes 0.\n",
    "\n",
    "    Returns a dict of the form {'v4': total}.\n",
    "    \"\"\"\n",
    "    total = 0\n",
    "    for d in v4:\n",
    "        hist = d.get('payload/keyedHistograms/SEARCH_COUNTS', {})\n",
    "        if hist == 'MISSING':\n",
    "            continue\n",
    "        for blob in hist.values():\n",
    "            total += blob.get('sum', 0)\n",
    "    return {'v4': total}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 85,
   "metadata": {
    "collapsed": true,
    "trusted": true
   },
   "outputs": [],
   "source": [
    "def get_v2_searches(v2):\n",
    "    \"\"\"Sum the per-day search counts in a client's v2 record.\n",
    "\n",
    "    Walks v2['data']['days'] (missing levels are treated as empty) and,\n",
    "    for each day, adds every value under 'org.mozilla.searches.counts'\n",
    "    except the '_v' entry (presumably a version marker rather than a\n",
    "    count -- TODO confirm).\n",
    "\n",
    "    Returns a dict of the form {'v2': total}, with the total cast to int.\n",
    "    \"\"\"\n",
    "    days = v2.get('data', {}).get('days', {}).values()\n",
    "    total = 0\n",
    "    for d in days:\n",
    "        search_dict = d.get('org.mozilla.searches.counts', {})\n",
    "        counts = [v for k, v in search_dict.items() if k != '_v']\n",
    "        total += sum(counts)\n",
    "    return {'v2': int(total)}"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Totals"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 86,
   "metadata": {
    "collapsed": true,
    "trusted": true
   },
   "outputs": [],
   "source": [
    "all_v2_kv = load_all.mapValues(lambda d: get_v2_searches(d['v2']))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 87,
   "metadata": {
    "collapsed": false,
    "trusted": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": "2103270"
     },
     "execution_count": 87,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "v2_total = all_v2_kv.mapValues(lambda d: d['v2']).values().sum()\n",
    "v2_total # 2103270"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 88,
   "metadata": {
    "collapsed": true,
    "trusted": true
   },
   "outputs": [],
   "source": [
    "all_v4_kv = load_all.mapValues(lambda d: get_v4_searches(d['v4']))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true,
    "trusted": true
   },
   "outputs": [],
   "source": [
    "v4_total = all_v4_kv.mapValues(lambda d: d['v4']).values().sum()\n",
    "v4_total # 290328"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 2",
   "language": "python",
   "name": "python2"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.9"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment