Created
July 1, 2015 23:33
-
-
Save SamPenrose/05e4dd652c6b95fec6bc to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{"nbformat_minor": 0, "cells": [{"source": "# Session Signature matching", "cell_type": "markdown", "metadata": {}}, {"execution_count": 81, "cell_type": "code", "source": "import ujson as json\nfrom operator import add\n# %pylab inline", "outputs": [], "metadata": {"collapsed": false, "trusted": true}}, {"execution_count": 82, "cell_type": "code", "source": "outBucketName = \"net-mozaws-prod-us-west-2-pipeline-analysis\"\npathToOutput = \"/bcolloran/mergedDataPerClient/nightly/2015-06-15/10009clients/\"", "outputs": [], "metadata": {"collapsed": true, "trusted": true}}, {"execution_count": 83, "cell_type": "code", "source": "# for a tiny sample, you can load one part: \"part-00000\"\n# or you can do more--\n# ten parts: part-0000*\n# or 10% of parts: part-*0\n# or all parts: part-*\npath_to_all = \"s3n://\"+outBucketName+pathToOutput+\"part-*\"\nf = sc.sequenceFile(path_to_all)\nload_all = f.mapValues(json.loads)", "outputs": [], "metadata": {"collapsed": true, "trusted": true}}, {"execution_count": 84, "cell_type": "code", "source": "def get_v4_searches(v4):\n total = 0\n for d in v4:\n hist = d.get('payload/keyedHistograms/SEARCH_COUNTS', {})\n if hist == 'MISSING':\n continue\n for blob in hist.values():\n total += blob.get('sum', 0)\n return {'v4': total}", "outputs": [], "metadata": {"collapsed": true, "trusted": true}}, {"execution_count": 85, "cell_type": "code", "source": "def get_v2_searches(v2):\n days = v2.get('data', {}).get('days', {}).values()\n total = 0\n for d in days:\n search_dict = d.get('org.mozilla.searches.counts', {})\n counts = [v for k, v in search_dict.items() if k != '_v']\n total += sum(counts)\n return {'v2': int(total)}", "outputs": [], "metadata": {"collapsed": true, "trusted": true}}, {"execution_count": 86, "cell_type": "code", "source": "all_v2_kv = load_all.mapValues(lambda d: get_v2_searches(d['v2']))", "outputs": [], "metadata": {"collapsed": true, "trusted": true}}, {"execution_count": 87, "cell_type": "code", "source": "v2_total = all_v2_kv.mapValues(lambda d: d['v2']).values().sum()\nv2_total # 2103270", "outputs": [{"execution_count": 87, "output_type": "execute_result", "data": {"text/plain": "2103270"}, "metadata": {}}], "metadata": {"collapsed": false, "trusted": true}}, {"execution_count": 88, "cell_type": "code", "source": "all_v4_kv = load_all.mapValues(lambda d: get_v4_searches(d['v4']))", "outputs": [], "metadata": {"collapsed": true, "trusted": true}}, {"execution_count": null, "cell_type": "code", "source": "v4_total = all_v4_kv.mapValues(lambda d: d['v4']).values().sum()\nv4_total # 290328", "outputs": [], "metadata": {"collapsed": true, "trusted": true}}], "nbformat": 4, "metadata": {"kernelspec": {"display_name": "Python 2", "name": "python2", "language": "python"}, "language_info": {"mimetype": "text/x-python", "nbconvert_exporter": "python", "version": "2.7.9", "name": "python", "file_extension": ".py", "pygments_lexer": "ipython2", "codemirror_mode": {"version": 2, "name": "ipython"}}}} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment