jdm/gist:6465718

## gistfile1.txt
from mercurial import ui, hg, cmdutil, match
from collections import defaultdict
import json
import sys

# Command-line inputs consumed here: argv[1] is the repository location and
# argv[2] / argv[3] are the two revisions that bound the range to analyse.
repo = hg.repository(ui.ui(), sys.argv[1])
from_rev, to_rev = sys.argv[2], sys.argv[3]
# Placeholder value only; rebound once the report named by argv[4] is parsed.
employees = {}

def sanitize(s):
    """Return *s* with typographic quotes swapped for their ASCII forms.

    Curly double quotes (U+201C, U+201D) become a straight double quote and
    curly single quotes (U+2018, U+2019) become an apostrophe; every other
    character is left untouched.
    """
    replacements = (
        (u"\u201c", '"'),
        (u"\u201d", '"'),
        (u"\u2018", "'"),
        (u"\u2019", "'"),
    )
    cleaned = s
    for fancy, plain in replacements:
        cleaned = cleaned.replace(fancy, plain)
    return cleaned

# Parse the JSON report named by argv[4] and derive the lookup lists used for
# author matching.  Each item under u'Report_Entry' is indexed by key, so the
# items are presumably dicts -- confirm against the report format.
with open(sys.argv[4]) as f:
    report = json.load(f)
    # Entries without a work e-mail are dropped before anything is derived.
    employees = [entry for entry in report[u'Report_Entry']
                 if u'primaryWorkEmail' in entry]
    emails = [entry[u'primaryWorkEmail'] for entry in employees]
    # (first, last) preferred-name pairs with curly quotes normalised.
    names = [(sanitize(entry[u'Preferred_Name_-_First_Name']),
              sanitize(entry[u'Preferred_Name_-_Last_Name']))
             for entry in employees]

# Extend the e-mail list with extra addresses from the file named by argv[5]:
# the first whitespace-separated token of each line is taken as an address.
with open(sys.argv[5]) as f:
    for line in f:
        tokens = line.split()
        # Blank or whitespace-only lines are skipped wherever they occur, so
        # an empty file or a stray blank line cannot raise IndexError.
        if tokens:
            emails.append(tokens[0])

# Per-author commit tally; an author seen for the first time starts at 0.
authors = defaultdict(int)
# No path patterns are supplied; presumably the matcher then accepts every
# file -- confirm against mercurial.match.
pats = ()
matchfn = match.match(repo.root, repo.getcwd(), pats)
# Revision range handed to the changeset walker, spelled "<to_rev>:<from_rev>".
opts = {'rev': ['%s:%s' % (to_rev, from_rev)]}
def prep(ctx, fns):
    """Per-changeset callback passed to ``cmdutil.walkchangerevs``.

    Looks up the parent revisions of ``ctx`` and returns ``None`` on every
    path; ``fns`` is not used.

    NOTE(review): the length test reads like an attempt to single out merge
    changesets, but nothing is done with its result, so this callback filters
    nothing -- confirm whether merges were meant to be excluded.
    """
    parent_revs = repo.changelog.parentrevs(ctx.rev())
    has_two_entries = len(parent_revs) == 2
    if has_two_entries:
        return

# Tally one commit per changeset yielded by the walk, keyed by author string.
for rev in cmdutil.walkchangerevs(repo, matchfn, opts, prep):
    # 'replace' keeps a single user name that is not valid UTF-8 from aborting
    # the whole run with UnicodeDecodeError; valid names decode exactly as
    # they would with strict decoding.
    author = str(rev.user()).decode('utf-8', 'replace')
    authors[author] += 1

# Split every author string into "employee" or "volunteer".
#
# An author counts as an employee when, in this order of checks:
#   1. some (first, last) pair matches: the last name occurs in the author
#      string and either the first name occurs in it too, or one of the
#      author's whitespace-separated words occurs inside the first name
#      (really dumb stemming, e.g. "Josh" in "Joshua");
#   2. a known employee e-mail address occurs in the author string;
#   3. last ditch: the author string contains an @mozilla.org / @mozilla.com
#      address.
# Everyone else is a volunteer.
employee_authors = set()
volunteer_authors = set()
# Volunteers for whom only a last name matched; used solely by the
# commented-out diagnostics further down.
partials = set()
for author in authors:
    words = author.split()
    surname_only = False
    is_employee = False
    for (first, last) in names:
        if last not in author:
            continue
        if first in author or any(word in first for word in words):
            is_employee = True
            break
        #print 'partial: %s vs %s' % (author, first + " " + last)
        surname_only = True
    if not is_employee:
        is_employee = (any(email in author for email in emails)
                       or '@mozilla.org' in author
                       or '@mozilla.com' in author)
    if is_employee:
        employee_authors.add(author)
    else:
        volunteer_authors.add(author)
        if surname_only:
            #print 'partial: %s' % author
            partials.add(author)

print 'Employees: %d' % len(employee_authors)
print 'Volunteers: %d' % len(volunteer_authors)
#print 'Partial matches: %d' % len(partials)
#print partials

emp_contributions = sum(map(lambda x: authors[x], filter(lambda x: x in employee_authors, authors)))
vol_contributions = sum(map(lambda x: authors[x], filter(lambda x: x in volunteer_authors, authors)))

print 'Employee contributions: %d' % emp_contributions
print 'Volunteer contributions: %d' % vol_contributions
sorted_volunteers = sorted(volunteer_authors, key=lambda x: authors[x], reverse=True)
sorted_employees = sorted(employee_authors, key=lambda x: authors[x], reverse=True)
N = 10
top_n_vol = map(lambda x: float(authors[x]), sorted_volunteers[:N])
top_n_emp = map(lambda x: float(authors[x]), sorted_employees[:N])
print 'Contributions from top %d employees: %d' % (N, sum(top_n_emp))
print 'Contributions from top %d volunteers: %d' % (N, sum(top_n_vol))
print 'Top %d volunteers responsible for %f%% of volunteer commits, %f%% overall' % (N, sum(top_n_vol) / vol_contributions * 100, sum(top_n_vol) / (emp_contributions + vol_contributions) * 100)
print 'Top %d employees responsible for %f%% of employee commits, %f%% overall' % (N, sum(top_n_emp) / emp_contributions * 100, sum(top_n_emp) / (emp_contributions + vol_contributions) * 100)

print 'Volunteer commit distribution:'
volunteer_buckets = defaultdict(int)
for author in volunteer_authors:
    volunteer_buckets[authors[author]] += 1
for key in sorted(volunteer_buckets.keys(), reverse=False):
    print '%s: %d' % (key, volunteer_buckets[key])
assert sum(volunteer_buckets.values()) == len(volunteer_authors)

print 'Bucketed volunteer commit distribution:'
buckets = [(1, 2), (2, 3), (3, 4), (4, 5), (5, 10), (10, 20), (20, 2000)]
bucketed = []
for i, (lower, higher) in enumerate(buckets):
    bucketed += [0]
    for subbucket in filter(lambda x: x >= lower and x < higher, volunteer_buckets.keys()):
        bucketed[i] += volunteer_buckets[subbucket]
    print "[%d, %d) - %d" % (lower, higher, bucketed[i])
	from mercurial import ui, hg, cmdutil, match
	from collections import defaultdict
	import json
	import sys

	# Command-line inputs consumed here: argv[1] is the repository location and
	# argv[2] / argv[3] are the two revisions that bound the range to analyse.
	repo = hg.repository(ui.ui(), sys.argv[1])
	from_rev, to_rev = sys.argv[2], sys.argv[3]
	# Placeholder value only; rebound once the report named by argv[4] is parsed.
	employees = {}

	def sanitize(s):
	    """Return *s* with typographic quotes swapped for their ASCII forms.

	    Curly double quotes (U+201C, U+201D) become a straight double quote and
	    curly single quotes (U+2018, U+2019) become an apostrophe; every other
	    character is left untouched.
	    """
	    replacements = (
	        (u"\u201c", '"'),
	        (u"\u201d", '"'),
	        (u"\u2018", "'"),
	        (u"\u2019", "'"),
	    )
	    cleaned = s
	    for fancy, plain in replacements:
	        cleaned = cleaned.replace(fancy, plain)
	    return cleaned

	# Parse the JSON report named by argv[4] and derive the lookup lists used for
	# author matching.  Each item under u'Report_Entry' is indexed by key, so the
	# items are presumably dicts -- confirm against the report format.
	with open(sys.argv[4]) as f:
	    report = json.load(f)
	    # Entries without a work e-mail are dropped before anything is derived.
	    employees = [entry for entry in report[u'Report_Entry']
	                 if u'primaryWorkEmail' in entry]
	    emails = [entry[u'primaryWorkEmail'] for entry in employees]
	    # (first, last) preferred-name pairs with curly quotes normalised.
	    names = [(sanitize(entry[u'Preferred_Name_-_First_Name']),
	              sanitize(entry[u'Preferred_Name_-_Last_Name']))
	             for entry in employees]

	# Extend the e-mail list with extra addresses from the file named by argv[5]:
	# the first whitespace-separated token of each line is taken as an address.
	with open(sys.argv[5]) as f:
	    for line in f:
	        tokens = line.split()
	        # Blank or whitespace-only lines are skipped wherever they occur, so
	        # an empty file or a stray blank line cannot raise IndexError.
	        if tokens:
	            emails.append(tokens[0])

	# Per-author commit tally; an author seen for the first time starts at 0.
	authors = defaultdict(int)
	# No path patterns are supplied; presumably the matcher then accepts every
	# file -- confirm against mercurial.match.
	pats = ()
	matchfn = match.match(repo.root, repo.getcwd(), pats)
	# Revision range handed to the changeset walker, spelled "<to_rev>:<from_rev>".
	opts = {'rev': ['%s:%s' % (to_rev, from_rev)]}
	def prep(ctx, fns):
	    """Per-changeset callback passed to ``cmdutil.walkchangerevs``.

	    Looks up the parent revisions of ``ctx`` and returns ``None`` on every
	    path; ``fns`` is not used.

	    NOTE(review): the length test reads like an attempt to single out merge
	    changesets, but nothing is done with its result, so this callback filters
	    nothing -- confirm whether merges were meant to be excluded.
	    """
	    parent_revs = repo.changelog.parentrevs(ctx.rev())
	    has_two_entries = len(parent_revs) == 2
	    if has_two_entries:
	        return

	# Tally one commit per changeset yielded by the walk, keyed by author string.
	for rev in cmdutil.walkchangerevs(repo, matchfn, opts, prep):
	    # 'replace' keeps a single user name that is not valid UTF-8 from aborting
	    # the whole run with UnicodeDecodeError; valid names decode exactly as
	    # they would with strict decoding.
	    author = str(rev.user()).decode('utf-8', 'replace')
	    authors[author] += 1

	# Split every author string into "employee" or "volunteer".
	#
	# An author counts as an employee when, in this order of checks:
	#   1. some (first, last) pair matches: the last name occurs in the author
	#      string and either the first name occurs in it too, or one of the
	#      author's whitespace-separated words occurs inside the first name
	#      (really dumb stemming, e.g. "Josh" in "Joshua");
	#   2. a known employee e-mail address occurs in the author string;
	#   3. last ditch: the author string contains an @mozilla.org / @mozilla.com
	#      address.
	# Everyone else is a volunteer.
	employee_authors = set()
	volunteer_authors = set()
	# Volunteers for whom only a last name matched; used solely by the
	# commented-out diagnostics further down.
	partials = set()
	for author in authors:
	    words = author.split()
	    surname_only = False
	    is_employee = False
	    for (first, last) in names:
	        if last not in author:
	            continue
	        if first in author or any(word in first for word in words):
	            is_employee = True
	            break
	        #print 'partial: %s vs %s' % (author, first + " " + last)
	        surname_only = True
	    if not is_employee:
	        is_employee = (any(email in author for email in emails)
	                       or '@mozilla.org' in author
	                       or '@mozilla.com' in author)
	    if is_employee:
	        employee_authors.add(author)
	    else:
	        volunteer_authors.add(author)
	        if surname_only:
	            #print 'partial: %s' % author
	            partials.add(author)

	print 'Employees: %d' % len(employee_authors)
	print 'Volunteers: %d' % len(volunteer_authors)
	#print 'Partial matches: %d' % len(partials)
	#print partials

	emp_contributions = sum(map(lambda x: authors[x], filter(lambda x: x in employee_authors, authors)))
	vol_contributions = sum(map(lambda x: authors[x], filter(lambda x: x in volunteer_authors, authors)))

	print 'Employee contributions: %d' % emp_contributions
	print 'Volunteer contributions: %d' % vol_contributions
	sorted_volunteers = sorted(volunteer_authors, key=lambda x: authors[x], reverse=True)
	sorted_employees = sorted(employee_authors, key=lambda x: authors[x], reverse=True)
	N = 10
	top_n_vol = map(lambda x: float(authors[x]), sorted_volunteers[:N])
	top_n_emp = map(lambda x: float(authors[x]), sorted_employees[:N])
	print 'Contributions from top %d employees: %d' % (N, sum(top_n_emp))
	print 'Contributions from top %d volunteers: %d' % (N, sum(top_n_vol))
	print 'Top %d volunteers responsible for %f%% of volunteer commits, %f%% overall' % (N, sum(top_n_vol) / vol_contributions * 100, sum(top_n_vol) / (emp_contributions + vol_contributions) * 100)
	print 'Top %d employees responsible for %f%% of employee commits, %f%% overall' % (N, sum(top_n_emp) / emp_contributions * 100, sum(top_n_emp) / (emp_contributions + vol_contributions) * 100)

	print 'Volunteer commit distribution:'
	volunteer_buckets = defaultdict(int)
	for author in volunteer_authors:
	volunteer_buckets[authors[author]] += 1
	for key in sorted(volunteer_buckets.keys(), reverse=False):
	print '%s: %d' % (key, volunteer_buckets[key])
	assert sum(volunteer_buckets.values()) == len(volunteer_authors)

	print 'Bucketed volunteer commit distribution:'
	buckets = [(1, 2), (2, 3), (3, 4), (4, 5), (5, 10), (10, 20), (20, 2000)]
	bucketed = []
	for i, (lower, higher) in enumerate(buckets):
	bucketed += [0]
	for subbucket in filter(lambda x: x >= lower and x < higher, volunteer_buckets.keys()):
	bucketed[i] += volunteer_buckets[subbucket]
	print "[%d, %d) - %d" % (lower, higher, bucketed[i])