Skip to content

Instantly share code, notes, and snippets.

@Uberi
Created April 14, 2016 16:55
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save Uberi/f17a3647ea3f9d087c576e8fe5b112c0 to your computer and use it in GitHub Desktop.
Save Uberi/f17a3647ea3f9d087c576e8fe5b112c0 to your computer and use it in GitHub Desktop.
crash-rate-aggregates-watchdog
Display the source blob
Display the rendered blob
Raw
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
# coding: utf-8
# # Crash Rate Aggregates Watchdog
# Watches the output directory of the [crash rate aggregates](https://github.com/mozilla/moz-crash-rate-aggregates) job on S3 to make sure it's properly outputting results. If the crash rate aggregates job ever fails, this notebook detects that and sends out an alert email.
# Configuration options:
# In[ ]:
# Location that is_job_failing() lists when looking for recent output partitions.
S3_BUCKET = "telemetry-parquet" # S3 bucket name
S3_PREFIX = "crash_aggregates/v1/" # key prefix listed with "/" as the delimiter; must end with a slash
# Addresses handed to send_ses() when the alert email is sent.
FROM_ADDR = "telemetry-alerts@mozilla.com" # email address to send alerts from
GENERAL_TELEMETRY_ALERT = "dev-telemetry-alerts@lists.mozilla.org" # email address that will receive notifications
# In[ ]:
import boto
from email.mime.application import MIMEApplication
from email.mime.multipart import MIMEMultipart
from email.mime.text import MIMEText
def send_ses(fromaddr,
             subject,
             body,
             recipient,
             filename=''):
    """Send an email via the Amazon SES service.

    Builds a multipart MIME message whose text part is ``body`` and, when
    ``filename`` is non-empty, attaches the raw contents of that file. The
    message is submitted through the connection returned by
    ``boto.connect_ses()``.

    Args:
      fromaddr: value for the message's From header.
      subject: value for the message's Subject header.
      body: text placed in the message's text part.
      recipient: value for the message's To header.
      filename: optional path of a file to attach; '' (the default) means
        no attachment. The same string is used as the attachment's
        Content-Disposition filename.

    Example:
      send_ses('me@example.com', 'greetings', "Hi!", 'you@example.com')

    Return:
      If 'ErrorResponse' appears in the return message from SES,
      return the message, otherwise return an empty '' string."""
    msg = MIMEMultipart()
    msg['Subject'] = subject
    msg['From'] = fromaddr
    msg['To'] = recipient
    msg.attach(MIMEText(body))

    if filename:
        # Read through a context manager so the file handle is closed even
        # if reading fails, instead of being left for garbage collection.
        with open(filename, 'rb') as attachment_file:
            attachment = attachment_file.read()
        part = MIMEApplication(attachment)
        part.add_header('Content-Disposition', 'attachment', filename=filename)
        msg.attach(part)

    conn = boto.connect_ses()
    result = conn.send_raw_email(msg.as_string())
    return result if 'ErrorResponse' in result else ''
# In[ ]:
import os
import sys
import re
from datetime import datetime, date, timedelta
import boto
def print_help():
    """Print a four-line usage summary for the watchdog to standard output.

    The program name shown in the usage lines is taken from ``sys.argv[0]``.
    """
    script = sys.argv[0]
    # Single-argument print(...) produces the same output under the Python 2
    # print statement and the Python 3 print function, matching the
    # print(...) call already used by the script body.
    print("Check if the crash rate aggregator job is giving the expected output.")
    print("Usage: {} email|test".format(script))
    print(" {} email [YYYY-MM-DD] if crash aggregates haven't been updated in about a day as of YYYY-MM-DD (defaults to current date), email the telemetry alerts mailing list saying so".format(script))
    print(" {} test [YYYY-MM-DD] print out whether crash aggregates have been updated in about a day as of YYYY-MM-DD (defaults to current date)".format(script))
def is_job_failing(current_date):
    """Report whether the crash aggregates job appears to have stopped producing output.

    Lists the entries directly under ``S3_PREFIX`` in ``S3_BUCKET`` and looks
    for one whose name ends in ``/submission_date=YYYY-MM-DD/`` with a date
    between two days before ``current_date`` and ``current_date``, inclusive.

    Args:
      current_date: the ``datetime.date`` to check against.

    Return:
      False as soon as a partition inside that window is found. True if no
      such partition exists, or if S3 answers with an error response while
      the bucket is being listed (for example because the bucket is missing).
    """
    partition_pattern = re.compile(r"/submission_date=(\d\d\d\d-\d\d-\d\d)/$")
    start, end = current_date - timedelta(days=2), current_date

    # obtain the S3 bucket
    conn = boto.s3.connect_to_region("us-west-2", host="s3-us-west-2.amazonaws.com")
    try:
        # validate=False skips the existence check, so a missing bucket only
        # shows up as S3ResponseError once the listing is iterated; the loop
        # therefore has to live inside this try for the handler to be reachable.
        bucket = conn.get_bucket(S3_BUCKET, validate=False)

        # list all of the prefixes under the given one
        for partition in bucket.list(prefix=S3_PREFIX, delimiter="/"):
            match = partition_pattern.search(partition.name)
            if not match:
                continue
            submission_date = datetime.strptime(match.group(1), "%Y-%m-%d").date()
            if start <= submission_date <= end:
                return False  # found suitable partition, job is working
    except boto.exception.S3ResponseError:  # bucket doesn't exist or can't be listed
        return True

    # no suitable partition found, job is failing
    return True
# In[ ]:
# Check the job's output as of today and alert the mailing list if it looks stale.
now = date.today()
if is_job_failing(now):
    print("Sending email notification about crash aggregates not being updated to {}.".format(GENERAL_TELEMETRY_ALERT))
    email_body = (
        "As of {}, the daily crash aggregates job [1] has not output results for 2 days. This is an automated message from Cerberus [2].\n"
        "\n"
        "[1]: https://github.com/mozilla/moz-crash-rate-aggregates\n"
        "[2]: https://github.com/mozilla/cerberus\n"
    ).format(now)
    # send_ses returns '' on success and the SES message when it contains
    # 'ErrorResponse'; surface that instead of silently dropping it, since a
    # lost alert defeats the purpose of the watchdog.
    send_error = send_ses(FROM_ADDR, "[FAILURE] Crash aggregates not updating", email_body, GENERAL_TELEMETRY_ALERT)
    if send_error:
        print("Failed to send crash aggregates alert email: {}".format(send_error))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment