#!/usr/bin/python
# Commit queue stats
#
# Requires: sudo apt-get install python-pip; sudo pip install requests requests_cache numpy
import argparse
import collections
from datetime import timedelta, datetime
import itertools
import json
import numpy
import operator
import re
import requests
import requests_cache
import time
import sys
import multiprocessing
parser = argparse.ArgumentParser()
parser.add_argument("--clear", help="clear the cache before starting", action="store_true")
parser.add_argument("--progress", help="Show progres", action="store_true")
parser.add_argument("--base-url-stats", help="print base_url matching stats", action="store_true")
parser.add_argument("--chrome", action='store_true')
parser.add_argument('--count', type=int, default=2000, help='number of issues to fetch')
args = parser.parse_args()
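
# Overall flow: page through Rietveld's /search API until --count issues whose
# base_url matches the selected repository (blink by default, chrome with
# --chrome) have been collected, fetch each matching issue with its messages,
# then replay the commit-bot messages to reconstruct every CQ attempt and
# print aggregate stats.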

def make_throttle_hook(timeout=1.0):
    def hook(response, *args, **kwargs):
        print response.url
        if not getattr(response, 'from_cache', False):
            print "sleeping"
            time.sleep(timeout)
        return response
    return hook
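
# The hook above runs on every response; it only sleeps when the response came
# from the network rather than from the requests_cache, so the throttle applies
# to real Rietveld traffic only.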

# Rietveld allows up to 1000 results per search query. I don't know if that's better for the server or not.
per_request_search_limit = min(int(args.count / 2), 1000)
# Look at the most recent --count (default 2000) entries.
target_count = args.count

# Only looking at Blink patches by default; pass --chrome for Chromium patches
# (eventually we should support switching these more generally):
if args.chrome:
    base_url_shortname = "chrome"
    target_base_url_regexps = [
        re.compile(r"^svn://svn\.chromium\.org/chrome/trunk/src"),
        re.compile(r"^https://chromium\.googlesource\.com/chromium/chromium/src(?:\.git)?(?:@[\w\-]+)?$"),
        re.compile(r"^http://src\.chromium\.org/chrome/trunk/src"),
    ]
else:
    base_url_shortname = "blink"
    target_base_url_regexps = [
        re.compile(r"^svn://svn\.chromium\.org/blink/(?:trunk/?)?$"),
        re.compile(r"^https://chromium\.googlesource\.com/chromium/blink(?:\.git)?(?:@[\w\-]+)?$"),
        re.compile(r"^http://src\.chromium\.org/blink/(?:trunk/?)?$"),
    ]

# It seems reasonable to expect that Rietveld can handle 10 queries per second from this script.
# Do not reduce this value without asking someone on the infrastructure team first.
seconds_between_requests = 0.1
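
# Responses are cached on disk via requests_cache (cache name 'cq'), so repeated
# runs only hit the network for data not already cached (or after --clear).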
requests_cache.install_cache('cq')
session = requests_cache.CachedSession()
if args.clear:
    print "Clearing cache..."
    session.cache.clear()
session.hooks = {'response': make_throttle_hook(seconds_between_requests)}

def fetch_recent_issues(limit, cursor=None):
    search_url = "https://codereview.chromium.org/search?format=json&limit=%d" % limit
    if cursor:
        search_url += "&cursor=%s" % cursor
    response = session.get(search_url)
    return response.json()

def matches_target_base_url(url):
    for regexp in target_base_url_regexps:
        match = regexp.match(url)
        if match:
            return True
    return False

def issue_patch_string(issue_id, patch_id):
    return "%s:%s" % (issue_id, patch_id)

def issue_display_url(issue_id):
    return "https://codereview.chromium.org/%s" % issue_id

def issue_url(issue_id):
    return "https://codereview.chromium.org/api/%s?messages=true" % issue_id

def date_from_iso_str(iso_str):
    # e.g. 2013-10-17 16:43:04.391480
    return datetime.strptime(iso_str, "%Y-%m-%d %H:%M:%S.%f")

def round_timedelta(delta):
    # We never care about the milliseconds when printing timedeltas:
    return timedelta(seconds=round(delta.total_seconds()))

def fetch_issue(issue_id):
    url = issue_url(issue_id)
    json = session.get(url).json()
    cache_key = session.cache._url_to_key(url)
    _, now = session.cache.get_response_and_time(cache_key)
    return json, now
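
# A CQAttempt records one trip of a single patchset through the commit queue:
# which issue/patchset, who started it, when it started and ended, and how it
# ended (its result code).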

class CQAttempt(object):
    def __init__(self):
        self.result = None
        self.patch_id = None
        self.issue_id = None
        self.author = None
        self.start_time = None
        self.end_time = None

    @property
    def issue_patch_string(self):
        return issue_patch_string(self.issue_id, self.patch_id)

    @property
    def duration(self):
        return self.end_time - self.start_time
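
# CQAnalyzer is a small state machine. process_message() walks an issue's
# messages in order; _start() marks the beginning of a CQ attempt when
# commit-bot posts its "CQ is trying da patch" message, and _end() records the
# attempt as a CQAttempt with a result code (SUCCESS, APPLY, TRY, etc.).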

class CQAnalyzer(object):
    def __init__(self):
        self.state = None
        self.issue_id = None
        self.author = None
        self.patch_id = None
        self.start_time = None
        self.attempts = []
        self.manual_commits = set()
        self.no_try_issues = set()
        # issue_id -> time since CQ start
        self.in_progress = {}

    start_regexp = re.compile("""CQ is trying da patch. Follow status at
https://chromium-status.appspot.com/cq/(?P<author>.*)/(?P<issue_id>\d+)/(?P<patch_id>\d+)""")
    committed_regexp = re.compile("Change committed as (?P<revision>\d+)")
    manual_commit_regexp = re.compile("Committed patchset #(?P<patch_id>\d+) manually as r(?P<revision>\d+)")
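
    # The regexps above match commit-bot's literal message texts: the "CQ is
    # trying da patch" start message, "Change committed as <revision>", and
    # "Committed patchset #N manually as r<revision>" for manual dcommits.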

    def _reset(self):
        self.state = None
        self.author = None
        self.patch_id = None
        self.start_time = None

    def _start(self, patch_id, start_time, author=None):
        if self.state == "START" and self.patch_id == patch_id:
            # print "Restarted for patchset %s on %s" % (patch_id, issue_display_url(self.issue_id))
            self._end("RESTART", start_time)
        self.state = "START"
        self.patch_id = patch_id
        self.author = author
        assert start_time
        self.start_time = start_time

    def _end(self, result, end_time):
        if self.state != "START":
            print "ERROR: %s for %s without start??" % (result, issue_display_url(self.issue_id))
            self._reset()
            return
        attempt = CQAttempt()
        attempt.result = result
        attempt.patch_id = self.patch_id
        attempt.issue_id = self.issue_id
        attempt.start_time = self.start_time
        attempt.end_time = end_time
        self.attempts.append(attempt)
        self._reset()

    def _start_and_end(self, result, date):
        self._start(result, date)
        self._end(result, date)

    def _manual_commit(self, patch_id):
        self.manual_commits.add(issue_patch_string(self.issue_id, patch_id))

    def process_message(self, message):
        text = message['text']
        date = date_from_iso_str(message['date'])
        match = self.manual_commit_regexp.match(text)
        if match:
            if self.start_time:
                self._end("MANUAL_COMMIT", date_from_iso_str(self.issue['modified']))
            patch_number = int(match.group('patch_id'))
            # gcl dcommit talks in terms of patch numbers 1-5, we need to map those to patchset ids:
            patch_id = self.issue['patchsets'][patch_number - 1]
            self._manual_commit(patch_id)
            return
        if message['sender'] != 'commit-bot@chromium.org':
            return
        match = self.start_regexp.match(text)
        if match:
            self._start(match.group('patch_id'), date, match.group('author'))
            return
        match = self.committed_regexp.match(text)
        if match:
            self._end("SUCCESS", date)
            return
        cq_end_messages = {
            "TRY": "Retried try job",
            "APPLY": "Failed to apply patch",
            "COMPILE": "Sorry for I got bad news for ya",
            "DESCRIPTION_CHANGED": "Commit queue rejected this change because the description",
            "REVIEWERS_CHANGED": "List of reviewers changed.",  # This is too conservative.
            "PATCH_CHANGED": "Commit queue failed due to new patchset.",  # User caused.
            # FAILED_TO_TRIGGER is a very serious failure, unclear why it happens!
            "FAILED_TO_TRIGGER": "Failed to trigger a try job on",
            # BINARY_FILE is just a bug in the CQ.
            "BINARY_FILE": "Can't process patch for file",
            # Unclear why UPDATE_STEP happens. Likely broken bots, shouldn't fail patches!
            "UPDATE_STEP": "Step \"update\" is always a major failure.",
        }
        for result, message_text in cq_end_messages.items():
            if text.startswith(message_text):
                self._end(result, date)
                return
        cq_fail_to_start_messages = {
            "NO_COMMENTS": "No comments yet.",
            "NO_LGTM": "No LGTM from a valid reviewer yet.",
        }
        for result, message_text in cq_fail_to_start_messages.items():
            if text.startswith(message_text):
                self._start_and_end(result, date)
                return
        print "Error %s has unknown message: %s" % (issue_display_url(self.issue_id), text)
        self._end("UNKNOWN", date)

    def start_issue(self, issue, now):
        # We may be dealing with cached data, so cache our concept of "now".
        self.now = now
        self.issue = issue
        self.issue_id = issue['issue']

    def end_issue(self, issue):
        issue_id = issue['issue']
        assert issue_id == self.issue_id
        if self.start_time:
            if self.issue['commit']:
                print "%s in progress? %s" % (round_timedelta(self.now - self.start_time), issue_display_url(self.issue_id))
                self._end("IN_PROGRESS", self.now)
            else:
                # There are various reasons why the commit box may have been unchecked, including that
                # the issue may be closed now. If those are not from a user, then that's a bug in rietveld
                # and out of scope for this script. For now we're just ignoring cases where the box was unchecked.
                # We could log an _end() event, but which time would we use? last_modified has at best a weak
                # correlation with the issue close or CQ checkbox.
                self._reset()
        # _reset() shouldn't be needed, but just in case.
        self._reset()
        self.issue_id = None
        self.issue = None

    def process_issue(self, issue, now):
        issue_id = issue['issue']
        if re.search('^NOTRY=[tT][rR][uU][eE]$', issue['description'], re.M):
            self.no_try_issues.add(issue_id)
        self.start_issue(issue, now)
        for message in issue['messages']:
            self.process_message(message)
            # print "%(date)s: %(text)s" % message
        self.end_issue(issue)

    # FIXME: Why am I writing my own table formatting? This must exist in python...
    def format_line(self, values, aligns, widths):
        values = map(str, values)
        args = zip(values, aligns, widths)
        return " ".join([align(value, width) for value, align, width in args])

    def print_attempts_table(self):
        result_getter = operator.attrgetter('result')
        attempts = sorted(self.attempts, key=result_getter)
        col_aligns = (str.rjust, str.ljust, str.rjust, str.rjust)
        col_widths = (5, 20, 16, 16)
        col_headers = ("count", "result", "mean duration", "median duration")
        print
        print "CQ Attempts by result (there can be more than one attempt on the same patch)"
        print self.format_line(col_headers, col_aligns, col_widths)
        print "-" * (sum(col_widths) + len(col_widths) - 1)
        for result, group_itr in itertools.groupby(attempts, result_getter):
            group = list(group_itr)
            durations = map(operator.attrgetter('duration'), group)
            mean = sum(durations, timedelta()) / len(group)
            durations_seconds = map(timedelta.total_seconds, durations)
            median_seconds = numpy.median(durations_seconds)
            median = timedelta(seconds=round(median_seconds))
            values = (len(group), result, round_timedelta(mean), median)
            print self.format_line(values, col_aligns, col_widths)
print
print "Fetching and analyzing %d most recent issues..." % target_count
analyzer = CQAnalyzer()
total_fetch_count = 0
log_every = 100
issue_count = 0
cursor = None
url_counts = collections.Counter()
matching_issue_ids = []
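
# Each /search page returns up to per_request_search_limit issues plus a cursor
# for the next page; keep paging until enough matching issues have been collected.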
while issue_count < target_count:
    search_results = fetch_recent_issues(per_request_search_limit, cursor)
    cursor = search_results['cursor']
    total_fetch_count += per_request_search_limit
    if args.base_url_stats:
        url_counts.update([result['base_url'] for result in search_results['results']])
    for result in search_results['results']:
        if not matches_target_base_url(result['base_url']):
            continue
        issue_count += 1
        if args.progress and (issue_count % log_every == 0):
            print "%d/%d" % (issue_count, target_count)
        matching_issue_ids.append(result['issue'])

# for i, issue_id in enumerate(matching_issue_ids):
#     if i % 100 == 0:
#         print 'Now fetching', i
#     issue, now = fetch_issue(issue_id)
pool = multiprocessing.Pool(100)
for issue, now in pool.map_async(fetch_issue, matching_issue_ids).get(99999):
    analyzer.process_issue(issue, now)
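
# Report: optional base_url match counts (--base-url-stats), then how many
# matching issues were found, how many tried the CQ, how many were committed
# manually or marked NOTRY, and a per-result table of attempt durations.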
if args.base_url_stats:
    print
    for url, count in url_counts.most_common():
        matches_target = matches_target_base_url(url)
        print "%s %s %s" % (matches_target, count, url)
    print

print "Found %s %s issues (in the %s most recent issues)." % (issue_count, base_url_shortname, total_fetch_count)

cq_issues = set([attempt.issue_id for attempt in analyzer.attempts])
cq_issue_count = len(cq_issues)
cq_patches = set([attempt.issue_patch_string for attempt in analyzer.attempts])
cq_patch_count = len(cq_patches)
print "%s issues (%d%%) (%d patches) tried the CQ, resulting in %d CQ attempts." % (
    cq_issue_count, 100 * cq_issue_count / issue_count, cq_patch_count, len(analyzer.attempts))

no_try_issues = len(cq_issues & analyzer.no_try_issues)
gave_up_patches = len(cq_patches & analyzer.manual_commits)
print "%s of those (%d%%) were committed manually (gave up). %s (%d%%) were marked NOTRY." % (
    gave_up_patches, 100 * gave_up_patches / cq_issue_count,
    no_try_issues, 100 * no_try_issues / cq_issue_count)
analyzer.print_attempts_table()
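
# Group attempts by (issue, patchset). A patch that eventually got a SUCCESS but
# needed more than one non-RESTART attempt implies a user had to re-check the CQ
# box after a false rejection.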
identifier_getter = operator.attrgetter('issue_patch_string')
attempts = sorted(analyzer.attempts, key=identifier_getter)
extra_user_clicks = 0
eventually_success_durations = []
eventually_success_attempts = []
for identifier, group_itr in itertools.groupby(attempts, identifier_getter):
    group = list(group_itr)
    if group[0].issue_id in analyzer.no_try_issues:
        continue
    results = map(operator.attrgetter('result'), group)
    # print "%s(%s): %s %s" % (identifier, len(group), total_time, results)
    if "SUCCESS" in results:
        # "RESTART" events may not require a user re-click?
        user_initiated = filter(lambda result: result != "RESTART", results)
        extra_user_clicks += len(user_initiated) - 1
        durations = map(operator.attrgetter('duration'), group)
        eventually_success_durations.append(sum(durations, timedelta()).total_seconds())
        eventually_success_attempts.append(len(group))

print "%s CQ starts (%d%%) were due to a previous false negative (CQ landed a patch it previously rejected)." % (
    extra_user_clicks, 100 * extra_user_clicks / len(analyzer.attempts))

print 'Patches which eventually land, percentiles:'
for i in [10, 25, 50, 75, 90, 95, 99]:
    eventually_success_durations_p = round(numpy.percentile(eventually_success_durations, i) / (60 * 60), 3)
    eventually_success_attempts_p = numpy.percentile(eventually_success_attempts, i)
    print "%d: %.3f hrs" % (i, eventually_success_durations_p)