Last active December 12, 2015 04:38
A Python script that analyzes code churn in a local git repository over different time intervals.
#! /usr/bin/env python
import sys, subprocess, os, pytz
from datetime import datetime, timedelta
from optparse import OptionParser
from dateutil.parser import parse as dateparse
def get_dates_and_shas(branch, start, end, interval):
    """Collect the (date, sha) pairs that the churn analysis should look at.

    Must be run with the current working directory inside the git
    repository that is being analysed.

    Args:
        branch (str): name of the branch to perform the analysis on.
        start (datetime): the start date. ``None`` means the start of the
            project (the oldest root commit reachable from ``branch``).
        end (datetime): the end date. ``None`` means now (UTC).
        interval (int): the interval in days between the diffs. ``0`` means
            "every individual non-merge commit".

    Returns:
        list of (date, sha) tuples. With ``interval == 0`` there is one
        entry per non-merge commit, in ``git rev-list`` order, dated with
        the committer date. Otherwise there is one entry per sampled point
        in time, newest first, ending with an entry for ``start``; the sha
        may be an empty string when no commit exists before that date.

    Raises:
        ValueError: if ``interval`` is negative.
    """
    # Checked first: a negative interval would make the sampling loop below
    # walk forward in time forever.
    if interval < 0:
        raise ValueError("interval can only be 0 or larger")

    def git_output(*arguments):
        # Run one git command and return everything it wrote to stdout as
        # text (universal_newlines keeps this str on both Python 2 and 3).
        process = subprocess.Popen(['git'] + list(arguments),
                                   stdout=subprocess.PIPE,
                                   universal_newlines=True)
        return process.communicate()[0]

    if end is None:
        end = datetime.now(pytz.UTC)

    if interval == 0:
        # If interval is 0, simply get all the non-merge commits and find
        # their dates. Without a start date the --since filter is omitted,
        # which makes git walk back to the start of the project.
        command = ['rev-list', '--no-merges', '--before', str(end)]
        if start is not None:
            command += ['--since', str(start)]
        command.append(branch)
        shas = git_output(*command).split()
        # Ask git for the committer date of every sha; that is the same
        # date --since/--before filter on.
        dates = [dateparse(git_output('show', '-s', '--format=%ci',
                                      sha).strip())
                 for sha in shas]
        return list(zip(dates, shas))

    if start is None:
        # Default to the date of the oldest root commit on the branch.
        root_dates = [dateparse(line) for line in
                      git_output('log', '--max-parents=0', '--format=%ci',
                                 branch).splitlines()
                      if line.strip()]
        if not root_dates:
            return []
        start = min(root_dates)

    def snapshot_sha(moment):
        # The 'state' of the branch at a given time is the first sha on the
        # branch (following first parents only) before that time.
        return git_output('rev-list', '-n1', '--first-parent',
                          '--before', str(moment), branch).strip()

    step = timedelta(days=interval)
    shas = []
    dates = []
    current = end
    while current >= start + step:
        shas.append(snapshot_sha(current))
        dates.append(current)
        current = current - step
    # Add the start date so it doesn't matter whether interval precisely
    # matches the amount of days in between start and end.
    shas.append(snapshot_sha(start))
    dates.append(start)
    # A real list, because the callers index and slice the result.
    return list(zip(dates, shas))
def get_churn_with_interval(dateshas, excludestr):
    """Calculate the total churn between consecutive snapshots in dateshas
    after filtering out the excluded paths.

    Every entry is diffed against the entry that precedes it in the
    sequence (``git diff <entry> <preceding entry>``), and the inserted
    lines reported by git are counted as churn.

    Args:
        dateshas: a sequence of (datetime, str) tuples of dates and commit
            shas.
        excludestr: a grep-friendly regular expression of paths to leave
            out, or an empty string to count every path.

    Returns:
        int: The cumulative amount of churned lines (0 when there are fewer
        than two snapshots).

    On stdout: a CSV of the form sha1;date1;sha2;date2;churn.
    """
    def count_insertions(stat_line):
        # A --shortstat line looks like
        # ' X files changed, Y insertions(+), Z deletions(-)', but git
        # leaves out the insertions or deletions part when it is zero, so
        # the number has to be found by its label, not by its position.
        for part in stat_line.split(','):
            fields = part.split()
            if (len(fields) >= 2 and fields[1].startswith('insertion')
                    and fields[0].isdigit()):
                return int(fields[0])
        return 0

    print("sha1;date1;sha2;date2;churn")  # CSV header line
    dateshas = list(dateshas)
    total = 0
    if not dateshas:
        return total

    date, sha = dateshas[0]
    for prevdate, prevsha in dateshas[1:]:
        if excludestr:
            # git diff --name-status lists the changed paths, cut drops the
            # status column, grep -v removes the excluded paths and xargs
            # hands what is left to git diff --shortstat. 'dummy' ensures an
            # empty set from grep does not lead to a diff of everything.
            files = subprocess.Popen(['git', 'diff', '-w', '-C',
                                      '--name-status', '--format=format:',
                                      prevsha, sha], stdout=subprocess.PIPE)
            cut = subprocess.Popen(['cut', '-f2,3'], stdin=files.stdout,
                                   stdout=subprocess.PIPE)
            grep = subprocess.Popen(['grep', '-v', excludestr],
                                    stdin=cut.stdout, stdout=subprocess.PIPE)
            xargs = subprocess.Popen(['xargs', '-L', '500', 'git', 'diff',
                                      '-w', '-C', '--shortstat',
                                      '--format=format:', prevsha, sha, '--',
                                      'dummy'], stdin=grep.stdout,
                                     stdout=subprocess.PIPE,
                                     universal_newlines=True)
            upstream = (files, cut, grep)
            # Drop our copies of the intermediate pipes so that only the
            # next stage of the pipeline holds them.
            for stage in upstream:
                stage.stdout.close()
            output = xargs.communicate()[0]
            for stage in upstream:
                stage.wait()
        else:
            # Without an exclude expression the shortstat can be requested
            # directly.
            files = subprocess.Popen(['git', 'diff', '-w', '-C',
                                      '--shortstat', '--format=format:',
                                      prevsha, sha], stdout=subprocess.PIPE,
                                     universal_newlines=True)
            output = files.communicate()[0]

        # Because of the xargs approach, there might be multiple result
        # lines (or none at all after filtering); sum over all of them.
        churn = sum(count_insertions(line) for line in output.splitlines())
        total += churn
        print("%s;%s;%s;%s;%d" % (prevsha[:8], str(prevdate), sha[:8],
                                  str(date), churn))
        date, sha = prevdate, prevsha
    return total
def get_churn_per_commit(dateshas, excludestr):
    """Calculate the total churn of the individual commits in dateshas
    after filtering out the excluded paths.

    Args:
        dateshas: an iterable of (datetime, str) tuples of dates and commit
            shas.
        excludestr: a grep-friendly regular expression of paths to leave
            out, or an empty string to count every path.

    Returns:
        int: The cumulative amount of churned (inserted) lines.

    On stdout: a CSV of the form sha;date;churn, with one row for every
    commit that has churn left after filtering.
    """
    def count_insertions(stat_line):
        # A --shortstat line looks like
        # ' X files changed, Y insertions(+), Z deletions(-)', but git
        # leaves out the insertions or deletions part when it is zero, so
        # the number has to be found by its label, not by its position.
        for part in stat_line.split(','):
            fields = part.split()
            if (len(fields) >= 2 and fields[1].startswith('insertion')
                    and fields[0].isdigit()):
                return int(fields[0])
        return 0

    print("sha;date;churn")  # CSV header line
    total = 0
    for date, sha in dateshas:
        if excludestr:
            # Example command with filtering:
            # git show abcde -w -C --name-status --format=format:
            #   Outputs all the changed files with just their filenames,
            #   as paths from the repository root. The -w flag ignores
            #   whitespace differences, the -C flag detects moves and
            #   renames and ignores those.
            # cut -f2,3:
            #   Cuts out the filename (column 2) and the rename
            #   destination (column 3, if it exists). This is done to not
            #   have the M/A/D/R modification indicator from the
            #   --name-status output.
            # grep -v '^Documentation/':
            #   Filters out all the files which are in the specified
            #   folders.
            # xargs -L 500 git show abcde -w -C --shortstat -- dummy
            #   xargs carries all the files that grep outputs over to git
            #   show, which formats the result into a line of the form
            #   'X files changed, Y insertions(+), Z deletions(-)'.
            #   xargs is used because the argument list length limit
            #   differs per platform, so this should make the script more
            #   portable. 'dummy' is specified to ensure an empty set from
            #   grep does not lead to 'git show' showing everything.
            show = subprocess.Popen(['git', 'show', sha, '-w', '-C',
                                     '--name-status', '--format=format:'],
                                    stdout=subprocess.PIPE)
            cut = subprocess.Popen(['cut', '-f2,3'], stdin=show.stdout,
                                   stdout=subprocess.PIPE)
            grep = subprocess.Popen(['grep', '-v', excludestr],
                                    stdin=cut.stdout, stdout=subprocess.PIPE)
            xargs = subprocess.Popen(['xargs', '-L', '500', 'git', 'show',
                                      sha, '-w', '-C', '--shortstat',
                                      '--format=format:', '--', 'dummy'],
                                     stdin=grep.stdout,
                                     stdout=subprocess.PIPE,
                                     universal_newlines=True)
            upstream = (show, cut, grep)
            # Drop our copies of the intermediate pipes so that only the
            # next stage of the pipeline holds them.
            for stage in upstream:
                stage.stdout.close()
            output = xargs.communicate()[0]
            for stage in upstream:
                stage.wait()
        else:
            # If there is no excludestr, we can simply ask for the shortstat
            # information.
            show = subprocess.Popen(['git', 'show', sha, '-w', '-C',
                                     '--shortstat', '--format=format:'],
                                    stdout=subprocess.PIPE,
                                    universal_newlines=True)
            output = show.communicate()[0]

        # Because of the xargs approach, there might be multiple result
        # lines. Sum the churn over all of them.
        churn = sum(count_insertions(line) for line in output.splitlines())
        # Only report commits that still have churn after the directory
        # filtering.
        if churn > 0:
            total += churn
            print("%s;%s;%d" % (sha[:8], str(date), churn))
    return total
def vararg_callback(option, opt_str, value, parser):
    """optparse callback that lets one option take a varying number of
    arguments; in this script it is used for the -x flag.

    Consumes the arguments that follow the option up to (not including)
    the next thing that looks like an option, and stores them as a list on
    ``parser.values`` under ``option.dest``. Negative numbers are treated
    as values, not as options. Note that positional arguments placed
    directly after the values are consumed as values too.

    Args:
        option: the Option instance that triggered the callback.
        opt_str: the option string as seen on the command line (unused).
        value: always None, because the option itself takes no argument.
        parser: the OptionParser; ``parser.rargs`` holds the arguments that
            are still to be processed.
    """
    assert value is None
    value = []

    def floatable(text):
        # True when text can be read as a number, e.g. '-3' or '-3.0'.
        try:
            float(text)
            return True
        except ValueError:
            return False

    for arg in parser.rargs:
        # Stop on options like --foo
        if arg[:2] == "--" and len(arg) > 2:
            break
        # Stop on -a, but not on negative numbers
        if arg[:1] == "-" and len(arg) > 1 and not floatable(arg):
            break
        value.append(arg)

    # Remove what was consumed so optparse does not process it again.
    del parser.rargs[:len(value)]
    setattr(parser.values, option.dest, value)
if __name__ == '__main__':
    # Command line entry point: parse the options, move into the
    # repository and print the churn CSV followed by a one-line summary.
    parser = OptionParser("Usage: %prog [options] <path> <branch>")
    parser.add_option("-s", "--start", dest="start",
                      help="Start date to check from, format DD-MM-YYYY")
    parser.add_option("-e", "--end", dest="end",
                      help="End date to stop check at, format DD-MM-YYYY")
    parser.add_option("-i", "--interval", dest="interval",
                      help="Number of days in between considered commits")
    parser.add_option("-x", "--exclude", dest="exclude", action="callback",
                      callback=vararg_callback,
                      help="Folders to be excluded")
    (options, args) = parser.parse_args()

    if len(args) < 2:
        print("Usage: %s [options] <path> <branch>" % sys.argv[0])
        sys.exit(1)

    start, end, interval, exclude_dirs = None, None, 0, []
    if options.start:
        start = datetime.strptime(options.start, "%d-%m-%Y")
        start = pytz.UTC.localize(start)
    if options.end:
        end = datetime.strptime(options.end, "%d-%m-%Y")
        end = pytz.UTC.localize(end)
    else:
        # No end date given: analyse up to the present moment.
        end = datetime.now(pytz.UTC)
    if options.interval:
        interval = int(options.interval)
    if interval < 0:
        print("Sorry, interval can only be 0 or larger")
        sys.exit(1)
    if options.exclude:
        exclude_dirs = options.exclude

    # Create the exclude regular expression: '^dir1\|^dir2\|...' in grep's
    # basic syntax. It is an empty string when nothing is excluded.
    excludestr = "\\|".join("^{}".format(folder) for folder in exclude_dirs)

    # Change the cwd so all commands are run in the correct folder
    os.chdir(args[0])

    dateshas = get_dates_and_shas(args[1], start, end, interval)
    # A missing start date is passed through as None; name it sensibly in
    # the summary line.
    start_label = str(start) if start is not None else "the start of the project"
    if interval == 0:
        total = get_churn_per_commit(dateshas, excludestr)
        print("Total churn between %s and %s, based on individual commits: %d"
              % (start_label, str(end), total))
    else:
        total = get_churn_with_interval(dateshas, excludestr)
        print("Total churn between %s and %s, with intervals of %d days: %d"
              % (start_label, str(end), interval, total))
