Last active
April 11, 2022 07:43
-
-
Save kulmajaba/ee7cfdeffaaeccd7e835e349ac2f84c8 to your computer and use it in GitHub Desktop.
Python script that parses the lines changed per commit in a Git repository and saves them as time-series data in a CSV file
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import argparse | |
import subprocess | |
import re | |
import csv | |
from datetime import datetime | |
from zoneinfo import ZoneInfo | |
from enum import Enum | |
from operator import add | |
# Pattern for the paths and files to include, written as a git pathspec
# (it is passed to the git commands as their path argument).
pathspec: str = '*.tex'
# Name of the time zone (as accepted by zoneinfo.ZoneInfo) in which dates
# and Unix timestamps are parsed.
timezone_name: str = "Europe/Helsinki"
# For pretty logging
class bcolors(Enum):
    """ANSI escape sequences used to colour terminal output.

    Members format as their raw escape sequence, so they can be embedded
    directly in f-strings: ``f'{bcolors.WARNING}text{bcolors.ENDC}'``.
    """

    WARNING = '\033[93m'  # switch the foreground colour to bright yellow
    ENDC = '\033[0m'      # reset all terminal attributes

    def __str__(self):
        # A plain Enum member formats as "bcolors.WARNING", which would print
        # the member name instead of colouring the text. Returning the value
        # makes str() and f-string interpolation emit the escape sequence.
        return self.value
# Splits the `git log` output right before every line that starts with a
# Unix timestamp, i.e. at the beginning of each commit entry.
COMMIT_SPLIT_PATTERN = re.compile(r'\n(?=\d{10})', re.MULTILINE)
# The Unix timestamp printed by `--format=%at` at the start of a commit entry.
TIMESTAMP_PATTERN = re.compile(r'\d{10,}')
# The `--stat` summary line, e.g. "2 files changed, 3 insertions(+), 1 deletion(-)".
# Either the insertions part or the deletions part may be absent.
CHANGE_PATTERN = re.compile(
    r'\d+ files? changed,( (?P<insertions>\d+) insertions?\(\+\))?,?( (?P<deletions>\d+) deletions?\(\-\))?')


# CLI arguments
def build_parser():
    """Create the argument parser describing the command-line interface."""
    parser = argparse.ArgumentParser(description="Parses lines of code changed for all commits in current branch to a CSV file")
    parser.add_argument('-s', '--start', dest='start_date', type=datetime.fromisoformat,
                        help='Start date (ISO 8601 -like format, e.g. "2022-06-01 14:00")')
    parser.add_argument('-d', '--sum-day', dest='sum_day', action=argparse.BooleanOptionalAction,
                        help='Sum the commits of each day and remove time from dates (also removes time zones)')
    parser.add_argument('--strip-timezone', dest='strip_timezone', action=argparse.BooleanOptionalAction,
                        help='Strip time zones from the output CSV')
    # Required: without a path the script would do all the work and only then
    # fail inside open().
    parser.add_argument('-o', '--output', dest='output_path', type=str, required=True,
                        help='Path to output file, e.g. "data.csv"')
    return parser


def run_git(*git_args):
    """Run ``git`` with the given arguments and capture its output.

    The command is executed without a shell, so arguments such as pathspecs
    containing ``*`` reach git unchanged on every platform.

    Returns:
        A ``(command, stdout)`` tuple, where ``command`` is a shell-quoted
        rendering of the command line for use in messages.

    Raises:
        subprocess.CalledProcessError: if git exits with a non-zero status.
    """
    import shlex  # only needed to render the command for messages

    cmd = ['git', *git_args]
    result = subprocess.run(cmd, check=True, capture_output=True, text=True)
    return shlex.join(cmd), result.stdout


def split_commits(log_output):
    """Split ``git log --stat --format=%at`` output into one string per commit.

    Blank chunks are dropped, so empty output yields an empty list.
    """
    return [chunk for chunk in COMMIT_SPLIT_PATTERN.split(log_output) if chunk.strip()]


def parse_commit(text, tz):
    """Parse one commit entry into ``[timestamp, insertions, deletions]``.

    Args:
        text: A single commit entry as produced by ``split_commits``.
        tz: The ``tzinfo`` in which the commit's Unix timestamp is expressed.

    Returns:
        A list holding the time-zone-aware commit datetime followed by the
        inserted and deleted line counts. A count that is missing from the
        summary line, or a missing summary line altogether, counts as 0.

    Raises:
        ValueError: if the entry does not start with a Unix timestamp.
    """
    stamp = TIMESTAMP_PATTERN.match(text)
    if stamp is None:
        raise ValueError(f'Commit entry does not start with a Unix timestamp: {text!r}')
    timestamp = datetime.fromtimestamp(int(stamp[0]), tz)

    # Some entries (e.g. merge commits) have no "files changed" summary at all.
    change = CHANGE_PATTERN.search(text)
    insertions = int(change.group('insertions')) if change and change.group('insertions') else 0
    deletions = int(change.group('deletions')) if change and change.group('deletions') else 0
    return [timestamp, insertions, deletions]


def localize_start(start, tz):
    """Return ``start`` as an aware datetime in ``tz``.

    A naive value is taken to be wall-clock time in ``tz`` (not in the
    system's local zone); an aware value is converted to ``tz``.
    """
    if start.tzinfo is None:
        return start.replace(tzinfo=tz)
    return start.astimezone(tz)


def sum_by_day(commits):
    """Collapse ``[datetime, insertions, deletions]`` rows into per-day totals.

    Returns:
        ``[date, insertions, deletions]`` rows, one per calendar day, in
        order of each day's first appearance in ``commits``.
    """
    totals = {}
    for stamp, insertions, deletions in commits:
        day = stamp.date()
        totals[day] = list(map(add, totals.get(day, [0, 0]), [insertions, deletions]))
    return [[day, ins, dels] for day, (ins, dels) in totals.items()]


def main(argv=None):
    """Parse the git history of the current branch and write it to a CSV file.

    Args:
        argv: Command-line arguments; defaults to ``sys.argv[1:]``.
    """
    args = build_parser().parse_args(argv)
    timez = ZoneInfo(timezone_name)
    # --start is optional, so only localize it when it was actually given.
    start_date = localize_start(args.start_date, timez) if args.start_date is not None else None
    print(f'Start date: {args.start_date}')

    # Get a reference count from rev-list so that we can compare commit count parsed from git log
    rev_list_cmd, rev_list_out = run_git('rev-list', '--count', 'HEAD', '--', pathspec)
    log_cmd, log_out = run_git('log', '--stat', '--format=%at', '--', pathspec)

    commits = [parse_commit(chunk, timez) for chunk in split_commits(log_out)]
    print(f'Commits: {len(commits)}')
    if int(rev_list_out) != len(commits):
        print(f'{bcolors.WARNING.value}WARNING: Commit count from `{rev_list_cmd}` and parsed commits from `{log_cmd}` do not match!{bcolors.ENDC.value}')

    if start_date is not None:
        commits = [row for row in commits if row[0] > start_date]
        print(f'Filtered commits: {len(commits)}')

    if args.sum_day:
        commits = sum_by_day(commits)
    elif args.strip_timezone:
        for commit in commits:
            commit[0] = commit[0].replace(tzinfo=None)

    header = ['Datetime', 'Insertions', 'Deletions']
    with open(args.output_path, 'w', encoding='UTF8', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(header)
        writer.writerows(commits)


if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment