Skip to content

Instantly share code, notes, and snippets.

@mhammond
Created September 1, 2016 11:03
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save mhammond/1301489b8ceae7976bfe237681fb236b to your computer and use it in GitHub Desktop.
Save mhammond/1301489b8ceae7976bfe237681fb236b to your computer and use it in GitHub Desktop.
bug 1298758 - corrupt places DB analysis
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Unable to parse whitelist (/home/hadoop/anaconda2/lib/python2.7/site-packages/moztelemetry/histogram-whitelists.json). Assuming all histograms are acceptable.\n"
]
}
],
"source": [
"from moztelemetry import get_pings, get_pings_properties\n",
"\n",
"# XXX - we should:\n",
"# * Work out how to exclude records before bug 1288445 landed (the same user is represented with\n",
"# 2 different IDs if pings before and after that patch are considered)\n",
"# * Work out some sane way of excluding \"problem pings\" that are probably yet to\n",
"# have places auto-fix their DB.\n",
"pings = get_pings_properties(get_pings(sc, doc_type='sync', fraction=1.0),\n",
" [\"payload/uid\", \"payload/when\", \"payload/engines\"])"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# Filter and group by userid.\n",
"\n",
"# Only pings with a bookmark Sync.\n",
"def enginePings(p):\n",
" return p.get(\"payload/engines\") and len(filter(lambda e: e[\"name\"] == \"bookmarks\", p[\"payload/engines\"])) > 0\n",
"\n",
"# group them by the UID.\n",
"grouped = pings.filter(enginePings).map(lambda p: (p[\"payload/uid\"], p)).groupByKey()"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Number of users with a bookmark sync: 47830\n",
"Number with errors: 8\n",
"Number where errors went away: 6\n"
]
}
],
"source": [
"# grouped is now a \"list\" of (uid, all_bookmark_pings_for_user)\n",
"\n",
"# Use a map function to simplify the structure and flag where the errors are seen.\n",
"# XXX - note that I'm also seeing this error for the \"tabs\" engine in some cases.\n",
"# I predict history will see it too - but for now, just consider bookmarks.\n",
"def mapErrors((uid, pings)):\n",
" sawError = fixed = False\n",
" for ping in sorted(pings, key=lambda p: p[\"payload/when\"]):\n",
" engines = ping[\"payload/engines\"]\n",
" for e in engines:\n",
" if e[\"name\"] == \"bookmarks\":\n",
" if \"failureReason\" not in e:\n",
" # It worked! If this is after we saw the corruption error it must have been \"fixed\".\n",
" if sawError:\n",
" fixed = True\n",
" break # there's no need to inspect any more...\n",
" elif e[\"failureReason\"][\"name\"] == \"unexpectederror\" and \\\n",
" e[\"failureReason\"][\"error\"] == \"Error: Error(s) encountered during statement execution: database disk image is malformed\":\n",
" # It's one of ours\n",
" sawError = True\n",
" # else it's some other error...\n",
" return sawError, fixed\n",
"\n",
"print \"Number of users with a bookmark sync:\", grouped.count()\n",
"\n",
"withErrors = grouped.map(mapErrors).filter(lambda (sawError, fixed): sawError)\n",
"print \"Number with errors:\", withErrors.count()\n",
"print \"Number where errors went away:\", withErrors.filter(lambda (sawError, fixed): fixed).count()\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 2",
"language": "python",
"name": "python2"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.11"
}
},
"nbformat": 4,
"nbformat_minor": 0
}
# coding: utf-8
# In[1]:
from moztelemetry import get_pings, get_pings_properties
# Load the "sync" pings, keeping only the three properties used below:
# the user id, the ping timestamp and the per-engine results.
#
# XXX - still to do:
# * Find a way to exclude records from before bug 1288445 landed (a user is
#   represented by 2 different IDs when pings from both before and after
#   that patch are considered).
# * Find a sane way to exclude "problem pings" for which places has probably
#   not yet auto-fixed its DB.
pings = get_pings_properties(
    get_pings(sc, doc_type="sync", fraction=1.0),
    ["payload/uid", "payload/when", "payload/engines"],
)
# In[2]:
# Filter and group by userid.
# Only pings with a bookmark Sync.
def enginePings(p):
    """Return True when ping *p* includes a "bookmarks" engine record.

    p is a mapping whose "payload/engines" entry is either falsy (missing,
    None or empty) or an iterable of engine records, each a mapping with a
    "name" key.

    Always returns a real bool.  any() stops at the first match and does not
    build an intermediate list, so it does not rely on filter() returning a
    list.
    """
    engines = p.get("payload/engines")
    if not engines:
        return False
    return any(e["name"] == "bookmarks" for e in engines)
# Keep only the pings that include a bookmarks sync, key each one by its
# user id, then gather every user's pings under that id.
grouped = (
    pings.filter(enginePings)
    .keyBy(lambda ping: ping["payload/uid"])
    .groupByKey()
)
# In[3]:
# grouped is now a "list" of (uid, all_bookmark_pings_for_user)
# Use a map function to simplify the structure and flag where the errors are seen.
# XXX - note that I'm also seeing this error for the "tabs" engine in some cases.
# I predict history will see it too - but for now, just consider bookmarks.
def mapErrors(uid_and_pings):
    """Summarise one user's bookmark syncs as a ``(sawError, fixed)`` pair.

    uid_and_pings is a single ``(uid, pings)`` pair.  It is taken as one
    argument and unpacked in the body rather than in the signature, so the
    definition is valid on both Python 2 and Python 3.  The uid is not used.
    Each ping is a mapping with "payload/when" (used as the sort key) and
    "payload/engines" (an iterable of engine records, each with a "name").

    Returns a tuple:
        sawError -- True if any "bookmarks" record failed with the
            "database disk image is malformed" unexpectederror.
        fixed -- True if, going through the pings in "payload/when" order, a
            "bookmarks" record without a failureReason appears after such an
            error.
    """
    _uid, pings = uid_and_pings
    corruption_error = (
        "Error: Error(s) encountered during statement execution: "
        "database disk image is malformed"
    )
    sawError = False
    for ping in sorted(pings, key=lambda p: p["payload/when"]):
        for e in ping["payload/engines"]:
            if e["name"] != "bookmarks":
                continue
            if "failureReason" not in e:
                # It worked! If this is after we saw the corruption error it
                # must have been "fixed".  Neither flag can change once both
                # are set, so stop here instead of scanning the later pings.
                if sawError:
                    return True, True
            else:
                reason = e["failureReason"]
                if (reason["name"] == "unexpectederror"
                        and reason["error"] == corruption_error):
                    # It's one of ours.
                    sawError = True
                # else it's some other error...
    return sawError, False
# Report how many users synced bookmarks, how many of them hit the corruption
# error, and how many of those later had a bookmark sync without a failure.
# Each line is printed as one pre-formatted string so the output is the same
# whether print is a statement or a function.
print("Number of users with a bookmark sync: %s" % grouped.count())

# mapErrors yields one (sawError, fixed) pair per user; keep the users that
# saw the error.  The result is cached because it is counted here and then
# filtered and counted again on the last line.
withErrors = grouped.map(mapErrors).filter(lambda result: result[0]).cache()
print("Number with errors: %s" % withErrors.count())
print("Number where errors went away: %s" % withErrors.filter(lambda result: result[1]).count())
# In[ ]:
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment