Skip to content

Instantly share code, notes, and snippets.

@kulmajaba
Last active April 11, 2022 07:43
Show Gist options
  • Save kulmajaba/ee7cfdeffaaeccd7e835e349ac2f84c8 to your computer and use it in GitHub Desktop.
Save kulmajaba/ee7cfdeffaaeccd7e835e349ac2f84c8 to your computer and use it in GitHub Desktop.
Python script for parsing lines changed per commit for a git repository and saving as time-series data in a CSV file
import argparse
import subprocess
import re
import csv
from datetime import datetime
from zoneinfo import ZoneInfo
from enum import Enum
from operator import add
# Time zone in which dates and Unix timestamps are interpreted
timezone_name: str = 'Europe/Helsinki'
# Git pathspec selecting the paths and files to include
pathspec: str = "*.tex"
# For pretty logging
class bcolors(Enum):
    """ANSI escape sequences used to colour terminal output.

    Members render as their raw escape sequence when converted with
    ``str()`` or interpolated into an f-string, so they can be embedded
    directly in messages. Without the ``__str__`` override a plain
    ``Enum`` member formats as its name (e.g. ``bcolors.WARNING``)
    and no colour is emitted.
    """

    # Switches the foreground colour for warning messages
    WARNING = '\033[93m'
    # Resets all terminal attributes back to their defaults
    ENDC = '\033[0m'

    def __str__(self) -> str:
        """Return the escape sequence itself instead of the member name."""
        return self.value
# CLI arguments
parser = argparse.ArgumentParser(
    description="Parses lines of code changed for all commits in current branch to a CSV file")
parser.add_argument(
    '-s', '--start', dest='start_date', type=datetime.fromisoformat,
    help='Start date (ISO 8601 -like format, e.g. "2022-06-01 14:00")')
parser.add_argument(
    '-d', '--sum-day', dest='sum_day', action=argparse.BooleanOptionalAction,
    help='Sum the commits of each day and remove time from dates (also removes time zones)')
parser.add_argument(
    '--strip-timezone', dest='strip_timezone', action=argparse.BooleanOptionalAction,
    help='Strip time zones from the output CSV')
# The output path is required: without it the script would only fail at the
# very end, when trying to open the CSV file for writing.
parser.add_argument(
    '-o', '--output', dest='output_path', type=str, required=True,
    help='Path to output file, e.g. "data.csv"')
args = parser.parse_args()

timez = ZoneInfo(timezone_name)

# --start is optional, so start_date stays None when it was not given.
start_date = args.start_date
if start_date is not None:
    if start_date.tzinfo is None:
        # A naive value is taken to be wall-clock time in the configured time
        # zone; astimezone() would instead assume the system's local zone.
        start_date = start_date.replace(tzinfo=timez)
    else:
        start_date = start_date.astimezone(timez)

# BooleanOptionalAction leaves the value as None when the flag is absent;
# normalise to real booleans.
sum_day: bool = bool(args.sum_day)
strip_timezone: bool = bool(args.strip_timezone)
output_path: str = args.output_path
print(f'Start date: {args.start_date}')
# Get a reference count from rev-list so that we can compare commit count parsed from git log
# NOTE: both commands are run through the shell; pathspec is a constant
# defined in this script, not external input.
rev_list_cmd = f"git rev-list --count HEAD '{pathspec}'"
rev_list = subprocess.run(rev_list_cmd, shell=True, check=True, capture_output=True, text=True)

# --format="%at" prints the author date of every commit as a Unix timestamp,
# followed by the --stat summary for that commit.
log_cmd = f"git log --stat --format=\"%at\" '{pathspec}'"
logs = subprocess.run(log_cmd, shell=True, check=True, capture_output=True, text=True)

# Split the log right before every line that starts with a 10-digit
# timestamp, so each chunk holds exactly one commit.
pattern = re.compile(r'\n(?=\d{10})', re.MULTILINE)
# Drop blank chunks: splitting an empty log yields [''] which would otherwise
# be counted (and later parsed) as a commit.
commits = [chunk for chunk in pattern.split(logs.stdout) if chunk.strip()]
print(f'Commits: {len(commits)}')

if int(rev_list.stdout) != len(commits):
    # Use .value so the raw escape codes are emitted; formatting the enum
    # member itself would print its name instead of colouring the text.
    print(f'{bcolors.WARNING.value}WARNING: Commit count from `{rev_list_cmd}` and parsed commits from `{log_cmd}` do not match!{bcolors.ENDC.value}')
# Unix timestamp at the very start of a commit chunk
timestamp_pattern = r'\d{10,}'
# Summary line of `git log --stat`; the insertions and deletions parts are
# each optional, so either named group may be missing from a match.
change_pattern = r'\d+ files? changed,( (?P<insertions>\d+) insertions?\(\+\))?,?( (?P<deletions>\d+) deletions?\(\-\))?'

# Turn every raw text chunk into a [timestamp, insertions, deletions] row.
# A new list is built instead of overwriting entries while iterating.
parsed_commits = []
for val in commits:
    timestamp_match = re.match(timestamp_pattern, val)
    if timestamp_match is None:
        # Not a commit chunk (e.g. the empty string left over when the log
        # produced no output); nothing to record.
        continue
    timestamp = datetime.fromtimestamp(int(timestamp_match[0]), timez)

    # A commit may come without a summary line (e.g. when git shows no diff
    # for it); count it as zero changed lines instead of crashing.
    match = re.search(change_pattern, val)
    insertions = int(match.group('insertions') or 0) if match else 0
    deletions = int(match.group('deletions') or 0) if match else 0
    parsed_commits.append([timestamp, insertions, deletions])
commits = parsed_commits
# When a start date was given, keep only the rows dated strictly after it
if args.start_date:
    commits = [row for row in commits if row[0] > start_date]
    print(f'Filtered commits: {len(commits)}')
if sum_day:
    # Collapse all commits of one calendar day into a single row; keys keep
    # the order in which each day is first encountered.
    daily_totals = {}
    for stamp, added, removed in commits:
        day = stamp.date()
        day_added, day_removed = daily_totals.get(day, (0, 0))
        daily_totals[day] = [day_added + added, day_removed + removed]
    commits = [[day, *totals] for day, totals in daily_totals.items()]
elif strip_timezone:
    # Keep the wall-clock time but discard the time zone information
    commits = [
        [stamp.replace(tzinfo=None), added, removed]
        for stamp, added, removed in commits
    ]
# Write the header row followed by one CSV row per commit (or per day).
# newline='' lets the csv module control line endings itself.
header = ['Datetime', 'Insertions', 'Deletions']
with open(output_path, 'w', encoding='UTF8', newline='') as csv_file:
    csv.writer(csv_file).writerows([header, *commits])
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment