Skip to content

Instantly share code, notes, and snippets.

@Uberi
Created April 14, 2016 16:55
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save Uberi/f17a3647ea3f9d087c576e8fe5b112c0 to your computer and use it in GitHub Desktop.
Save Uberi/f17a3647ea3f9d087c576e8fe5b112c0 to your computer and use it in GitHub Desktop.
crash-rate-aggregates-watchdog
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Crash Rate Aggregates Watchdog\n",
"Watches the output directory of the [crash rate aggregates](https://github.com/mozilla/moz-crash-rate-aggregates) job on S3 to make sure it's properly outputting results. If the crash rate aggregates job ever fails, this notebook detects that and sends out an alert email."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Configuration options:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"S3_BUCKET = \"telemetry-parquet\" # S3 bucket name\n",
"S3_PREFIX = \"crash_aggregates/v1/\" # must end with a slash\n",
"\n",
"FROM_ADDR = \"telemetry-alerts@mozilla.com\" # email address to send alerts from\n",
"GENERAL_TELEMETRY_ALERT = \"dev-telemetry-alerts@lists.mozilla.org\" # email address that will receive notifications"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"import boto\n",
"\n",
"from email.mime.application import MIMEApplication\n",
"from email.mime.multipart import MIMEMultipart\n",
"from email.mime.text import MIMEText\n",
"\n",
"def send_ses(fromaddr,\n",
" subject,\n",
" body,\n",
" recipient,\n",
" filename=''):\n",
" \"\"\"Send an email via the Amazon SES service.\n",
"\n",
"Example:\n",
" send_ses('me@example.com, 'greetings', \"Hi!\", 'you@example.com)\n",
"\n",
"Return:\n",
" If 'ErrorResponse' appears in the return message from SES,\n",
" return the message, otherwise return an empty '' string.\"\"\"\n",
" msg = MIMEMultipart()\n",
" msg['Subject'] = subject\n",
" msg['From'] = fromaddr\n",
" msg['To'] = recipient\n",
" msg.attach(MIMEText(body))\n",
"\n",
" if filename:\n",
" attachment = open(filename, 'rb').read()\n",
" part = MIMEApplication(attachment)\n",
" part.add_header('Content-Disposition', 'attachment', filename=filename)\n",
" msg.attach(part)\n",
"\n",
" conn = boto.connect_ses()\n",
" result = conn.send_raw_email(msg.as_string())\n",
"\n",
" return result if 'ErrorResponse' in result else ''"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"import os\n",
"import sys\n",
"import re\n",
"from datetime import datetime, date, timedelta\n",
"\n",
"import boto\n",
"\n",
"def print_help():\n",
" print \"Check if the crash rate aggregator job is giving the expected output.\"\n",
" print \"Usage: {} email|test\".format(sys.argv[0])\n",
" print \" {} email [YYYY-MM-DD] if crash aggregates haven't been updated in about a day as of YYYY-MM-DD (defaults to current date), email the telemetry alerts mailing list saying so\".format(sys.argv[0])\n",
" print \" {} test [YYYY-MM-DD] print out whether crash aggregates have been updated in about a day as of YYYY-MM-DD (defaults to current date)\".format(sys.argv[0])\n",
"\n",
"def is_job_failing(current_date):\n",
" # obtain the S3 bucket\n",
" conn = boto.s3.connect_to_region(\"us-west-2\", host=\"s3-us-west-2.amazonaws.com\")\n",
" try:\n",
" bucket = conn.get_bucket(S3_BUCKET, validate=False)\n",
" except boto.exception.S3ResponseError: # bucket doesn't exist\n",
" return True\n",
"\n",
" # list all of the prefixes under the given one\n",
" crash_aggregate_partitions = bucket.list(prefix=S3_PREFIX, delimiter=\"/\")\n",
" start, end = current_date - timedelta(days=2), current_date\n",
" for partition in crash_aggregate_partitions:\n",
" match = re.search(r\"/submission_date=(\\d\\d\\d\\d-\\d\\d-\\d\\d)/$\", partition.name)\n",
" if not match: continue\n",
" submission_date = datetime.strptime(match.group(1), \"%Y-%m-%d\").date()\n",
" if start <= submission_date <= end:\n",
" return False # found suitable partition, job is working\n",
"\n",
" # no suitable partition found, job is failing\n",
" return True"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"now = date.today()\n",
"if is_job_failing(now):\n",
" print(\"Sending email notification about crash aggregates not being updated to {}.\".format(GENERAL_TELEMETRY_ALERT))\n",
" email_body = (\n",
" \"As of {}, the daily crash aggregates job [1] has not output results for 2 days. This is an automated message from Cerberus [2].\\n\"\n",
" \"\\n\"\n",
" \"[1]: https://github.com/mozilla/moz-crash-rate-aggregates\\n\"\n",
" \"[2]: https://github.com/mozilla/cerberus\\n\"\n",
" ).format(now)\n",
" send_ses(FROM_ADDR, \"[FAILURE] Crash aggregates not updating\", email_body, GENERAL_TELEMETRY_ALERT)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 2",
"language": "python",
"name": "python2"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.11"
}
},
"nbformat": 4,
"nbformat_minor": 0
}
# coding: utf-8
# # Crash Rate Aggregates Watchdog
# Watches the output directory of the [crash rate aggregates](https://github.com/mozilla/moz-crash-rate-aggregates) job on S3 to make sure it's properly outputting results. If the crash rate aggregates job ever fails, this notebook detects that and sends out an alert email.
# Configuration options:
# In[ ]:
# S3 bucket name.
S3_BUCKET = "telemetry-parquet"
# Key prefix inside the bucket; must end with a slash.
S3_PREFIX = "crash_aggregates/v1/"
# Email address to send alerts from.
FROM_ADDR = "telemetry-alerts@mozilla.com"
# Email address that will receive notifications.
GENERAL_TELEMETRY_ALERT = "dev-telemetry-alerts@lists.mozilla.org"
# In[ ]:
import boto
from email.mime.application import MIMEApplication
from email.mime.multipart import MIMEMultipart
from email.mime.text import MIMEText
def send_ses(fromaddr,
             subject,
             body,
             recipient,
             filename=''):
    """Send an email via the Amazon SES service.

    Args:
        fromaddr: Address placed in the message's From header.
        subject: Subject line of the message.
        body: Text attached as the message body.
        recipient: Address placed in the message's To header.
        filename: Optional path of a file to attach; when empty (the
            default) no attachment is added.

    Example:
        send_ses('me@example.com', 'greetings', "Hi!", 'you@example.com')

    Return:
        If 'ErrorResponse' appears in the return message from SES,
        return the message, otherwise return an empty '' string.
    """
    msg = MIMEMultipart()
    msg['Subject'] = subject
    msg['From'] = fromaddr
    msg['To'] = recipient
    msg.attach(MIMEText(body))

    if filename:
        # Read through a context manager so the file handle is closed
        # right away instead of being left open until garbage collection.
        with open(filename, 'rb') as attachment_file:
            attachment = attachment_file.read()
        part = MIMEApplication(attachment)
        part.add_header('Content-Disposition', 'attachment', filename=filename)
        msg.attach(part)

    conn = boto.connect_ses()
    result = conn.send_raw_email(msg.as_string())

    return result if 'ErrorResponse' in result else ''
# In[ ]:
import os
import sys
import re
from datetime import datetime, date, timedelta
import boto
def print_help():
    """Print a short description and usage summary to standard output.

    The program name shown in the usage lines is taken from sys.argv[0].
    """
    # Single-argument, parenthesized print calls produce the same output
    # under the Python 2 print statement and the Python 3 print function.
    program = sys.argv[0]
    print("Check if the crash rate aggregator job is giving the expected output.")
    print("Usage: {} email|test".format(program))
    print(" {} email [YYYY-MM-DD] if crash aggregates haven't been updated in about a day as of YYYY-MM-DD (defaults to current date), email the telemetry alerts mailing list saying so".format(program))
    print(" {} test [YYYY-MM-DD] print out whether crash aggregates have been updated in about a day as of YYYY-MM-DD (defaults to current date)".format(program))
def is_job_failing(current_date):
    """Report whether the crash aggregates job has stopped producing output.

    Lists the entries directly under S3_PREFIX in S3_BUCKET and looks for a
    partition named ``submission_date=YYYY-MM-DD`` whose date lies in the
    window from two days before current_date up to current_date, inclusive.

    Args:
        current_date: A date marking the end of the window; it is compared
            against the dates parsed from the partition names.

    Returns:
        False if a partition inside the window exists. True if none is
        found, or if S3 responds with an error while the bucket is listed.
    """
    conn = boto.s3.connect_to_region("us-west-2", host="s3-us-west-2.amazonaws.com")
    start, end = current_date - timedelta(days=2), current_date

    try:
        # validate=False skips the existence check, so a missing bucket only
        # surfaces once the (lazy) listing is iterated. The iteration
        # therefore has to live inside the try for the handler to be reachable.
        bucket = conn.get_bucket(S3_BUCKET, validate=False)

        # list all of the prefixes under the given one
        for partition in bucket.list(prefix=S3_PREFIX, delimiter="/"):
            match = re.search(r"/submission_date=(\d\d\d\d-\d\d-\d\d)/$", partition.name)
            if not match:
                continue
            try:
                submission_date = datetime.strptime(match.group(1), "%Y-%m-%d").date()
            except ValueError:
                # Digits matched the pattern but are not a real calendar date.
                continue
            if start <= submission_date <= end:
                return False  # found suitable partition, job is working
    except boto.exception.S3ResponseError:  # e.g. bucket doesn't exist
        return True

    # no suitable partition found, job is failing
    return True
# In[ ]:
# Check the job as of today and alert the mailing list if it looks stalled.
now = date.today()
if is_job_failing(now):
    print("Sending email notification about crash aggregates not being updated to {}.".format(GENERAL_TELEMETRY_ALERT))
    email_body = (
        "As of {}, the daily crash aggregates job [1] has not output results for 2 days. This is an automated message from Cerberus [2].\n"
        "\n"
        "[1]: https://github.com/mozilla/moz-crash-rate-aggregates\n"
        "[2]: https://github.com/mozilla/cerberus\n"
    ).format(now)
    send_error = send_ses(FROM_ADDR, "[FAILURE] Crash aggregates not updating", email_body, GENERAL_TELEMETRY_ALERT)
    if send_error:
        # send_ses returns a non-empty value only when the SES response
        # contained 'ErrorResponse'; report it instead of dropping it.
        print("Failed to send email notification: {}".format(send_error))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment