Created
September 6, 2013 15:44
-
-
Save jdm/6465718 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from collections import defaultdict
import json
import sys

from mercurial import ui, hg, cmdutil, match
# Command-line arguments:
#   1    path of the Mercurial repository to open
#   2, 3 the two revisions bounding the range of changesets that is walked
#   4    JSON report; its 'Report_Entry' list holds the employee records
#   5    text file whose lines each start with an additional employee email
if len(sys.argv) < 6:
    # Fail early with a readable message instead of an IndexError later on.
    sys.exit('usage: %s REPO FROM_REV TO_REV EMPLOYEES_JSON EXTRA_EMAILS'
             % sys.argv[0])

repo = hg.repository(ui.ui(), sys.argv[1])
from_rev = sys.argv[2]
to_rev = sys.argv[3]
# Placeholder only; rebound to the list of report entries when the JSON
# report is loaded.
employees = {}
def sanitize(s):
    """Return *s* with typographic quotes replaced by their ASCII forms.

    The left/right double quotation marks (U+201C, U+201D) become '"' and
    the left/right single quotation marks (U+2018, U+2019) become "'".
    Every other character is left unchanged.
    """
    replacements = (
        (u"\u201c", '"'),
        (u"\u201d", '"'),
        (u"\u2018", "'"),
        (u"\u2019", "'"),
    )
    for curly, plain in replacements:
        s = s.replace(curly, plain)
    return s
# Load the employee report. Only entries that carry a work email are kept;
# they provide both the list of known employee emails and the
# (first name, last name) pairs used for name matching.
with open(sys.argv[4]) as f:
    employees = json.load(f)[u'Report_Entry']
# Real lists (not lazy iterators): `employees` is traversed twice here and
# `names` / `emails` are re-scanned for every commit author later on.
employees = [entry for entry in employees if u'primaryWorkEmail' in entry]
emails = [entry[u'primaryWorkEmail'] for entry in employees]
names = [(sanitize(entry[u'Preferred_Name_-_First_Name']),
          sanitize(entry[u'Preferred_Name_-_Last_Name']))
         for entry in employees]

# Additional employee emails: the first whitespace-separated token of each
# line. Blank lines (anywhere, including an entirely empty file) are skipped
# rather than raising IndexError.
with open(sys.argv[5]) as f:
    for line in f:
        fields = line.split()
        if fields:
            emails.append(fields[0])
# Number of walked changesets per author string.
authors = defaultdict(int)

pats = ()  # no file patterns
opts = {'rev': [to_rev + ':' + from_rev]}
matchfn = match.match(repo.root, repo.getcwd(), pats)


def prep(ctx, fns):
    """Callback passed to ``cmdutil.walkchangerevs``; currently a no-op.

    It looks up the parent revisions of *ctx* and returns, but it returns
    ``None`` on every path and mutates nothing, so it has no effect on
    which changesets are counted.

    NOTE(review): this looks like an attempt to skip merge changesets --
    confirm the intent. As written nothing is skipped.
    """
    rev = ctx.rev()
    if len(repo.changelog.parentrevs(rev)) == 2:
        return


for ctx in cmdutil.walkchangerevs(repo, matchfn, opts, prep):
    user = ctx.user()
    # Author strings are matched against unicode names/emails later on, so
    # decode byte strings once here.
    author = user.decode('utf-8') if isinstance(user, bytes) else user
    authors[author] += 1
# Classify every commit author as employee or volunteer.
#
# Checks, in order; the first hit marks the author as an employee:
#   1. an employee's last name AND first name both appear in the author string
#   2. a known employee email appears in the author string
#   3. the author string contains an @mozilla.org / @mozilla.com address
# Anyone left over is counted as a volunteer.
employee_authors = set()
volunteer_authors = set()
# Authors whose string matched some employee's last name but not the first
# name; kept for diagnostics only and cleared once the author is classified
# as an employee.
partials = set()

for author in authors:
    for (first, last) in names:
        if last in author:
            # Really dumb stemming - if a "word" of the author's line is
            # contained in the provided first name, claim it's a match
            # (eg. Josh in Joshua).
            if first in author or any(word in first for word in author.split()):
                employee_authors.add(author)
                partials.discard(author)
                break
            else:
                partials.add(author)
    else:
        # No name matched; fall back to the known email addresses.
        for email in emails:
            if email in author:
                employee_authors.add(author)
                partials.discard(author)
                break
        else:
            # Last ditch. I feel bad.
            if '@mozilla.org' in author or '@mozilla.com' in author:
                partials.discard(author)
                employee_authors.add(author)
            else:
                volunteer_authors.add(author)
def _percent(part, whole):
    """Return *part* as a percentage of *whole*, or 0.0 when *whole* is 0."""
    if not whole:
        return 0.0
    return float(part) / whole * 100


# Head counts.
print('Employees: %d' % len(employee_authors))
print('Volunteers: %d' % len(volunteer_authors))

# Commit totals per group.
emp_contributions = sum(authors[a] for a in authors if a in employee_authors)
vol_contributions = sum(authors[a] for a in authors if a in volunteer_authors)
print('Employee contributions: %d' % emp_contributions)
print('Volunteer contributions: %d' % vol_contributions)

# Share of the work done by the N most active people of each group.
sorted_volunteers = sorted(volunteer_authors, key=lambda x: authors[x], reverse=True)
sorted_employees = sorted(employee_authors, key=lambda x: authors[x], reverse=True)
N = 10
top_n_vol = [float(authors[a]) for a in sorted_volunteers[:N]]
top_n_emp = [float(authors[a]) for a in sorted_employees[:N]]
# Sum once: the totals are needed several times below.
top_vol_total = sum(top_n_vol)
top_emp_total = sum(top_n_emp)
all_contributions = emp_contributions + vol_contributions

print('Contributions from top %d employees: %d' % (N, top_emp_total))
print('Contributions from top %d volunteers: %d' % (N, top_vol_total))
# _percent() reports 0% instead of dividing by zero when a group is empty.
print('Top %d volunteers responsible for %f%% of volunteer commits, %f%% overall'
      % (N, _percent(top_vol_total, vol_contributions),
         _percent(top_vol_total, all_contributions)))
print('Top %d employees responsible for %f%% of employee commits, %f%% overall'
      % (N, _percent(top_emp_total, emp_contributions),
         _percent(top_emp_total, all_contributions)))
print('Volunteer commit distribution:')
# Maps a commit count to the number of volunteers with exactly that many
# commits.
volunteer_buckets = defaultdict(int)
for author in volunteer_authors:
    volunteer_buckets[authors[author]] += 1
for key in sorted(volunteer_buckets):
    print('%s: %d' % (key, volunteer_buckets[key]))
# Sanity check: every volunteer landed in exactly one entry.
assert sum(volunteer_buckets.values()) == len(volunteer_authors)

print('Bucketed volunteer commit distribution:')
# Half-open [lower, higher) ranges of commit counts.
buckets = [(1, 2), (2, 3), (3, 4), (4, 5), (5, 10), (10, 20), (20, 2000)]
bucketed = []
for lower, higher in buckets:
    in_range = sum(people for commits, people in volunteer_buckets.items()
                   if lower <= commits < higher)
    bucketed.append(in_range)
    print("[%d, %d) - %d" % (lower, higher, in_range))

# Volunteers at or above the last upper bound would otherwise vanish from
# the report; show them on an extra line, only when there are any.
top_bound = buckets[-1][1]
overflow = sum(people for commits, people in volunteer_buckets.items()
               if commits >= top_bound)
if overflow:
    print("[%d, inf) - %d" % (top_bound, overflow))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment