# Source: GitHub gist willkg/25e28570fd8c95537dbd7f9e2855c7c8, created March 17, 2017 15:14.
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.

"""To run:

1. ``mkvirtualenv --python=/usr/bin/python3 crashids``
2. ``pip install -r requirements.txt``

This gets crash ids before and after a certain build, pulls down their
``upload_file_minidump`` files from s3, then compares all the files and prints
out a rough analysis.
"""

import logging
import os
import pathlib
import sys
import boto3
from botocore.client import Config
import requests
# NOTE(review): ``SUPERSEARCH_URL`` (used by ``get_by_query``) and ``RESULTS``
# (used by ``main``) are referenced in this file but are not defined anywhere
# in the visible source -- confirm they are defined before running.

# S3 bucket the dump files are fetched from.
BUCKET_NAME = 'org.mozilla.crash-stats.production.crashes'
# AWS region name.
REGION = 'us-west-2'
# Fixed-width row template used by ``print_results``; each hit must provide
# the ``date``, ``product``, ``version`` and ``uuid`` keys.
HIT_TMPL = '%(date)-32s %(product)-10s %(version)-10s %(uuid)-20s'
# S3 object key template for a crash's dump; filled in with ``crashid``.
FILENAME_TMPL = 'v1/dump/%(crashid)s'
def build_s3_client(access_key, secret_access_key):
    """Build the S3 client used to pull down dump files.

    :arg access_key: AWS access key id
    :arg secret_access_key: AWS secret access key

    :returns: a boto3 S3 client bound to ``REGION``

    """
    session = boto3.session.Session(
        aws_access_key_id=access_key,
        aws_secret_access_key=secret_access_key,
    )
    client = session.client(
        's3',
        region_name=REGION,
        # The botocore option is spelled "addressing_style"; a misspelled key
        # is not a recognised option, so path-style addressing never took
        # effect. Path style is presumably wanted because the bucket name
        # contains dots -- TODO confirm.
        config=Config(s3={'addressing_style': 'path'}),
    )
    return client
def get_by_query(query):
    """Run a SuperSearch query and return the HTTP response.

    :arg query: iterable of ``name=value`` strings; they are joined with
        ``&`` and appended to ``SUPERSEARCH_URL`` as the query string

    :returns: the ``requests`` response object; callers decode it themselves
        (for example with ``.json()``)

    """
    url = SUPERSEARCH_URL + '?' + '&'.join(query)
    return requests.get(url)
def print_results(results):
    """Print the results of the SuperSearch query

    This helps to make sure the query is correct and we're getting back
    appropriate results.

    :arg results: iterable of hit dicts, each with ``date``, ``product``,
        ``version`` and ``uuid`` keys

    """
    # Header row: reuse the row template with the column names as values so
    # the header lines up with the data rows.
    print(HIT_TMPL % {'date': 'date', 'product': 'product', 'version': 'version', 'uuid': 'uuid'})
    for res in results:
        print(HIT_TMPL % res)
def fetch_and_save(s3_client, dir_, hits):
    """Fetch the dumps for the specified crashes and save them in specified
    directory.

    :arg s3_client: S3 client providing ``get_object``
    :arg dir_: directory to save dumps in; created if it does not exist
    :arg hits: iterable of hit dicts; only the ``uuid`` key is used

    Dumps that already exist on disk are skipped, so an interrupted run can be
    resumed without downloading everything again.

    """
    os.makedirs(dir_, exist_ok=True)

    for hit in hits:
        crashid = hit['uuid']
        fn = os.path.join(dir_, crashid)
        if os.path.exists(fn):
            print('already exists %s' % fn)
            continue

        print('fetching %s upload_file_minidump...' % crashid)
        # Fetch before opening the output file: if the download raises, no
        # empty file is left behind to be mistaken for a finished download on
        # the next run.
        resp = s3_client.get_object(
            Bucket=BUCKET_NAME,
            Key=FILENAME_TMPL % {'crashid': crashid},
        )
        with open(fn, 'wb') as fp:
            fp.write(resp['Body'].read())
def analyze(dir_):
    """Analyze the files in the directory and print stats to stdout

    Walks ``dir_`` recursively and prints the number of regular files along
    with the average, median, 95th-percentile and maximum file size in bytes.

    :arg dir_: directory to analyze

    If no files are found (empty or missing directory), only the file count
    is printed; the remaining stats are undefined for an empty set.

    """
    root = pathlib.Path(dir_)
    sizes = sorted(f.stat().st_size for f in root.glob('**/*') if f.is_file())
    count = len(sizes)

    print(' Number of files: %10d' % count)
    if not sizes:
        # Avoid ZeroDivisionError / IndexError on an empty data set.
        return

    print(' Average size: %10d' % (sum(sizes) // count))
    # For an even count this picks the upper of the two middle values, which
    # is good enough for a rough analysis.
    print(' Median size: %10d' % sizes[count // 2])
    print(' 95%% size: %10d' % sizes[int(count * 0.95)])
    print(' Max size: %10d' % sizes[-1])
def _fetch_day_range(s3_client, label, days, build_id_filter):
    """Fetch dumps for each February 2017 day in ``days`` into ``./<label>/<date>``.

    :arg s3_client: S3 client passed through to ``fetch_and_save``
    :arg label: name of the top-level output directory (``before``/``after``)
    :arg days: iterable of day-of-month numbers to query, one query per day
    :arg build_id_filter: operator plus build id, e.g. ``<20170209030214``

    """
    for day in days:
        date = '2017-02-%02d' % day
        # NOTE(review): the product, channel and build id filters follow the
        # criteria listed in main(); the parameter names are the SuperSearch
        # API's -- confirm against the API documentation.
        query = (
            'product=Firefox',
            'release_channel=nightly',
            'platform=Windows NT',
            'build_id=' + build_id_filter,
            'date=>' + date,
            'date=<2017-02-%02d' % (day + 1),
            '_results_number=%d' % RESULTS,
        )
        resp = get_by_query(query)
        hits = resp.json()['hits']
        fetch_and_save(s3_client, os.path.join('.', label, date), hits)


def main(args):
    """Download the "before" and "after" dump sets and print size stats.

    :arg args: two-item sequence ``(access_key, secret_access_key)`` holding
        the AWS credentials used to read the dumps from S3

    """
    access_key, secret_access_key = args

    # Build an S3 client which we'll use to pull down dump files
    s3_client = build_s3_client(access_key, secret_access_key)

    # Get all crash ids that:
    # - product: Firefox
    # - channel: nightly
    # - OS: windows
    # - build id < 20170209030214 vs. build id >= 20170209030214

    # We get some crashes that match our criteria per day for 2017-02-01
    # through 2017-02-09. This is the "before the change" set.
    _fetch_day_range(s3_client, 'before', range(1, 10), '<20170209030214')

    # We get some crashes that match our criteria per day for 2017-02-10
    # through 2017-02-18. This is the "after the change" set.
    _fetch_day_range(s3_client, 'after', range(10, 19), '>=20170209030214')

    # Analyze the before and after sets--these print to stdout
    analyze(os.path.join('.', 'before'))
    analyze(os.path.join('.', 'after'))
if __name__ == '__main__':
    # main() expects exactly two arguments: the AWS access key id and the
    # AWS secret access key.
    main(sys.argv[1:])