Skip to content

Instantly share code, notes, and snippets.

@georgf
Last active April 7, 2016 09:46
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save georgf/083dac118168b543c71f7ca011a3d4e8 to your computer and use it in GitHub Desktop.
Save georgf/083dac118168b543c71f7ca011a3d4e8 to your computer and use it in GitHub Desktop.
Validating core ping with v=2 after defaultSearch landed
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Bug 1249288 - Validating the \"core\" ping's defaultSearch field"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Validate \"core\" pings sent by Firefox for Android to make sure the data they contain makes sense."
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {
"collapsed": false,
"scrolled": true
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Populating the interactive namespace from numpy and matplotlib\n"
]
}
],
"source": [
"import ujson as json\n",
"import matplotlib.pyplot as plt\n",
"import pandas as pd\n",
"import numpy as np\n",
"import plotly.plotly as py\n",
"import datetime as dt\n",
"from uuid import UUID\n",
"\n",
"from moztelemetry import get_pings, get_pings_properties, get_one_ping_per_client, get_clients_history\n",
"\n",
"%pylab inline"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"submission_dates = (\"20160329\", \"20160410\")\n",
"core_pings = get_pings(sc,\n",
" app=\"Fennec\",\n",
" channel=\"nightly\",\n",
" doc_type=\"core\",\n",
" source_version=\"2\",\n",
" submission_date=submission_dates,\n",
" fraction=1.0)"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"38403"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"pings_count = core_pings.count()\n",
"pings_count"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### How many different clients are we seeing?"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"2102"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"one_per_client = get_one_ping_per_client(core_pings)\n",
"num_clients = one_per_client.count()\n",
"num_clients"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Are the pings respecting our desired schema?"
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {
"collapsed": false,
"scrolled": true
},
"outputs": [
{
"data": {
"text/plain": [
"defaultdict(int,\n",
" {'': 37287,\n",
" 'missing key: profileDate': 791,\n",
" 'wrong type: defaultSearch': 325})"
]
},
"execution_count": 29,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"def core_ping_check(p):\n",
"    \"\"\"Validate a single \"core\" ping against the expected schema.\n",
"\n",
"    Args:\n",
"        p: the ping, as a dict of its top-level fields.\n",
"\n",
"    Returns:\n",
"        An (error, ping) tuple. error is '' when every check passes,\n",
"        otherwise it names the first check that failed. Always returning a\n",
"        tuple keeps the result usable with countByKey()/groupByKey().\n",
"    \"\"\"\n",
"    # That's a sort-of schema to validate the required fields and their types.\n",
"    # Types are matched exactly. defaultSearch may be null, which has to be\n",
"    # listed as type(None): type(value) is never equal to None itself.\n",
"    req_fields = {\n",
"        \"v\": [int],\n",
"        \"clientId\": [unicode],\n",
"        \"seq\": [int],\n",
"        \"locale\": [unicode],\n",
"        \"os\": [unicode],\n",
"        \"osversion\": [unicode],\n",
"        \"device\": [unicode],\n",
"        \"arch\": [unicode],\n",
"        \"profileDate\": [int, long],\n",
"        \"defaultSearch\": [unicode, type(None)],\n",
"    }\n",
"\n",
"    opt_fields = {\n",
"        \"experiments\": list,\n",
"    }\n",
"\n",
"    # Does the ping contain all the required top-level fields?\n",
"    for k, types in req_fields.items():\n",
"        if k not in p:\n",
"            return (\"missing key: \" + k, p)\n",
"        if type(p[k]) not in types:\n",
"            return (\"wrong type: \" + k, p)\n",
"\n",
"    # Does it contain any optional field? If so, make sure it has the correct type.\n",
"    for k, expected_type in opt_fields.items():\n",
"        if k in p and type(p[k]) != expected_type:\n",
"            return (\"wrong type: \" + k, p)\n",
"\n",
"    # Perform some additional sanity checks.\n",
"    if p[\"v\"] < 1:\n",
"        return (\"check failed: ping.v < 1\", p)\n",
"    if p[\"seq\"] < 0:\n",
"        return (\"check failed: ping.seq < 0\", p)\n",
"    if p[\"profileDate\"] < 0:\n",
"        return (\"check failed: ping.profileDate < 0\", p)\n",
"    if p[\"profileDate\"] < 10957: # profileDates before the year 2000?\n",
"        return (\"check failed: ping.profileDate < 10957\", p)\n",
"    if p[\"profileDate\"] > 17167: # profileDates after the year 2016?\n",
"        return (\"check failed: ping.profileDate > 17167\", p)\n",
"\n",
"    # A null defaultSearch passed the type check above and has no length,\n",
"    # so the length bounds only apply to actual strings.\n",
"    default_search = p[\"defaultSearch\"]\n",
"    if default_search is not None:\n",
"        if len(default_search) < 1:\n",
"            return (\"check failed: ping.defaultSearch length < 1\", p)\n",
"        if len(default_search) > 20:\n",
"            return (\"check failed: ping.defaultSearch length > 20\", p)\n",
"\n",
"    # Validate the clientId: UUID() raises ValueError on a malformed string.\n",
"    try:\n",
"        UUID(p[\"clientId\"], version=4)\n",
"    except ValueError:\n",
"        return (\"check failed: clientId is UUID\", p)\n",
"\n",
"    return (\"\", p)\n",
"\n",
"checked_pings = core_pings.map(core_ping_check)\n",
"result_counts = checked_pings.countByKey()\n",
"result_counts"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"So we have broken pings. Let's check examples for the types of failures:"
]
},
{
"cell_type": "code",
"execution_count": 30,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# Keep only the pings that failed a check and collect them per failure message.\n",
"grouped_checked_pings = (\n",
"    checked_pings\n",
"    .filter(lambda result: result[0] != '')\n",
"    .groupByKey()\n",
"    .collectAsMap()\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 31,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"[('wrong type: defaultSearch',\n",
" {u'arch': u'armeabi-v7a',\n",
" u'clientId': '...',\n",
" u'defaultSearch': None,\n",
" u'device': u'LGE-Nexus 5',\n",
" u'experiments': [u'bookmark-history-menu',\n",
" u'offline-cache',\n",
" u'search-term',\n",
" u'content-notifications-12hrs'],\n",
" u'locale': u'it-IT',\n",
" 'meta': {u'Host': u'incoming.telemetry.mozilla.org',\n",
" 'Hostname': u'ip-172-31-15-33',\n",
" u'Size': 302.0,\n",
" 'Timestamp': 1460007217536733952L,\n",
" 'Type': u'telemetry',\n",
" u'appBuildId': u'20160406030221',\n",
" u'appName': u'Fennec',\n",
" u'appUpdateChannel': u'nightly',\n",
" u'appVersion': u'48.0a1',\n",
" u'clientId': '...',\n",
" u'docType': u'core',\n",
" u'documentId': '...',\n",
" u'geoCity': u'Rieti',\n",
" u'geoCountry': u'IT',\n",
" u'sampleId': 51.0,\n",
" u'sourceName': u'telemetry',\n",
" u'sourceVersion': u'2',\n",
" u'submissionDate': u'20160407'},\n",
" u'os': u'Android',\n",
" u'osversion': u'23',\n",
" u'profileDate': 16635,\n",
" u'seq': 1615,\n",
" u'v': 2}),\n",
" ('missing key: profileDate',\n",
" {u'arch': u'armeabi-v7a',\n",
" u'clientId': '...',\n",
" u'defaultSearch': u'yahoo',\n",
" u'device': u'LGE-Nexus 5',\n",
" u'experiments': [],\n",
" u'locale': u'en-US',\n",
" 'meta': {u'Host': u'incoming.telemetry.mozilla.org',\n",
" 'Hostname': u'ip-172-31-25-112',\n",
" u'Size': 200.0,\n",
" 'Timestamp': 1459445111084972544L,\n",
" 'Type': u'telemetry',\n",
" u'appBuildId': u'20160331030231',\n",
" u'appName': u'Fennec',\n",
" u'appUpdateChannel': u'nightly',\n",
" u'appVersion': u'48.0a1',\n",
" u'clientId': '...',\n",
" u'docType': u'core',\n",
" u'documentId': '...',\n",
" u'geoCity': u'Mountain View',\n",
" u'geoCountry': u'US',\n",
" u'sampleId': 4.0,\n",
" u'sourceName': u'telemetry',\n",
" u'sourceVersion': u'2',\n",
" u'submissionDate': u'20160331'},\n",
" u'os': u'Android',\n",
" u'osversion': u'19',\n",
" u'seq': 77,\n",
" u'v': 2})]"
]
},
"execution_count": 31,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"def sanitized_first(t):\n",
"    \"\"\"Return (key, first ping of the group) with identifying fields redacted.\n",
"\n",
"    Args:\n",
"        t: a (key, pings) pair, where pings is an iterable of ping dicts.\n",
"\n",
"    Returns:\n",
"        (key, ping) where ping is a redacted copy of the first ping: its\n",
"        clientId, meta.clientId and meta.documentId are replaced by '...'.\n",
"    \"\"\"\n",
"    first = list(t[1])[0]\n",
"    # Redact copies rather than the collected pings themselves, so the\n",
"    # original data stays intact for later cells and for re-runs.\n",
"    p = dict(first)\n",
"    p['meta'] = dict(first['meta'])\n",
"    p['clientId'] = '...'\n",
"    p['meta']['clientId'] = '...'\n",
"    p['meta']['documentId'] = '...'\n",
"    return (t[0], p)\n",
"map(sanitized_first, grouped_checked_pings.iteritems())"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Let's see which search engines are submitted."
]
},
{
"cell_type": "code",
"execution_count": 32,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"defaultdict(int,\n",
" {None: 325,\n",
" u'amazondotcom': 2,\n",
" u'baidu': 1341,\n",
" u'bing': 350,\n",
" u'duckduckgo': 932,\n",
" u'google': 25337,\n",
" u'wikipedia-de': 1,\n",
" u'wikipedia-es': 10,\n",
" u'yahoo': 8719,\n",
" u'yahoo-france': 29,\n",
" u'yandex-ru': 1357})"
]
},
"execution_count": 32,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"def get_engine(p):\n",
"    \"\"\"Return the ping's defaultSearch value, or None when the field is absent.\"\"\"\n",
"    return p[\"defaultSearch\"] if \"defaultSearch\" in p else None\n",
"\n",
"engines = core_pings.map(get_engine)\n",
"engine_counts = engines.countByValue()\n",
"engine_counts"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Let's get percentages for that."
]
},
{
"cell_type": "code",
"execution_count": 34,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"[(u'baidu', 3.492),\n",
" (u'google', 65.977),\n",
" (u'duckduckgo', 2.427),\n",
" (u'wikipedia-es', 0.026),\n",
" (u'bing', 0.911),\n",
" (u'amazondotcom', 0.005),\n",
" (u'yahoo', 22.704),\n",
" (None, 0.846),\n",
" (u'yahoo-france', 0.076),\n",
" (u'yandex-ru', 3.534),\n",
" (u'wikipedia-de', 0.003)]"
]
},
"execution_count": 34,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"[(k, round((float(v) / pings_count) * 100, 3)) for k, v in engine_counts.iteritems()]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 2",
"language": "python",
"name": "python2"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.11"
}
},
"nbformat": 4,
"nbformat_minor": 0
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment