Skip to content

Instantly share code, notes, and snippets.

@jwbowers
Created April 18, 2024 09:55
Show Gist options
  • Save jwbowers/48881df7df9bf33bca029db8fe9c9bc6 to your computer and use it in GitHub Desktop.
Save jwbowers/48881df7df9bf33bca029db8fe9c9bc6 to your computer and use it in GitHub Desktop.
List the percentage change in text between each commit and the previous commit. Can be more than 100.
import subprocess
import sys
import csv
# uses the GitPython module
import git
from pathlib import Path
## Note: GitPython must be installed before running this script.
## Recommended:
## python3 -m venv .venv
## source .venv/bin/activate
## pip3 install GitPython
## To use on the file causal-inference.Rmd at the unix/mac command line:
## python3 commit_sizes.py causal-inference/causal-inference.Rmd
## basically we just need to give it the path to a file
# The file to analyse is the first command-line argument. Stop with a short
# usage message instead of an IndexError traceback when it is missing.
if len(sys.argv) < 2:
    sys.exit(f"usage: {sys.argv[0]} PATH_TO_FILE")
file_path = sys.argv[1]
def get_commits_for_file(file_path):
    """Return abbreviated SHAs of the commits that touched ``file_path``.

    The repository is located by searching upward from the current
    directory. Commits are requested with ``reverse=True``, i.e. oldest
    first.

    Args:
        file_path: Path of the file whose history is wanted, passed to
            ``Repo.iter_commits`` as its ``paths`` filter.

    Returns:
        A list with the first 7 characters of each commit's hex SHA. The
        list is empty when no repository can be opened or when no commit
        matches the path.
    """
    # Keep the try body to the one call that raises these errors.
    # InvalidGitRepositoryError is the case that matters in practice (the
    # script was started outside any Git working tree); NoSuchPathError is
    # kept so callers relying on the old handling still get an empty list.
    try:
        repo = git.Repo(".", search_parent_directories=True)
    except (git.exc.NoSuchPathError, git.exc.InvalidGitRepositoryError):
        print("Error: no Git repository found at or above the current directory.")
        return []

    # NOTE(review): 7-character SHAs can be ambiguous in very large
    # repositories; kept because the CSV output uses this short form.
    return [
        commit.hexsha[:7]
        for commit in repo.iter_commits(paths=file_path, reverse=True)
    ]
#file_path = "causal-inference/causal-inference.Rmd"
#commits = get_commits_for_file(file_path)
##
##if commits:
## print("Commits for", file_path, ":")
## for commit_sha in commits:
## print(commit_sha)
##else:
## print("No commits found for", file_path)
def get_commit_diff(commit_hash, file_path):
    """Return the diff that ``commit_hash`` introduced to ``file_path``.

    Runs ``git diff <commit>^ <commit> -- <file_path>`` in the current
    working directory, so the path is resolved by git relative to that
    directory.

    Args:
        commit_hash: Revision whose change is wanted. It must have a parent,
            because the diff is taken against ``commit_hash + '^'``.
        file_path: Path the diff is limited to.

    Returns:
        The command's standard output split on ``'\\n'``. Output that ends
        with a newline therefore yields a trailing empty string. The exit
        status is not checked, so a failing git command simply produces
        whatever (usually empty) output it wrote to stdout.
    """
    result = subprocess.run(
        # Parent first, commit second: ``git diff A B`` describes the change
        # going from A to B, so this order marks text added by the commit
        # with '+' and removed text with '-'. The '--' keeps a path from
        # being mistaken for a revision or an option.
        ['git', 'diff', commit_hash + '^', commit_hash, '--', file_path],
        capture_output=True,
        text=True,
        # Decode independently of the locale; undecodable bytes are replaced
        # rather than aborting the whole run.
        encoding='utf-8',
        errors='replace',
    )
    return result.stdout.split('\n')
def count_words(diff_lines):
    """Count the words added and deleted in unified-diff text.

    A word is a whitespace-separated token. Lines starting with ``+`` count
    towards additions and lines starting with ``-`` towards deletions. The
    ``--- a/file`` / ``+++ b/file`` header lines that precede the first hunk
    of each file are skipped, since they are not file content.

    Args:
        diff_lines: Diff text as a sequence of strings, one per line, such
            as the list returned by ``get_commit_diff``.

    Returns:
        A ``(added_words, deleted_words)`` tuple of ints.
    """
    added_words = 0
    deleted_words = 0
    # Header lines can only be told apart from content by position: inside a
    # hunk every line starts with ' ', '+', '-' or '\\', so a removed line
    # whose text begins with "-- " legitimately shows up as "--- ...".
    in_hunk = False
    for line in diff_lines:
        if line.startswith('@@'):
            in_hunk = True
            continue
        if line.startswith('diff --git '):
            # A new file section begins; its headers come before any hunk.
            in_hunk = False
            continue
        if not in_hunk and line.startswith(('+++ ', '--- ')):
            continue
        if line.startswith('+'):
            added_words += len(line[1:].split())
        elif line.startswith('-'):
            deleted_words += len(line[1:].split())
    return added_words, deleted_words
def calculate_percentage_change(added_words, deleted_words, total_words):
    """Return changed words as a percentage of ``total_words``.

    Additions and deletions are summed, so the result can exceed 100.

    Args:
        added_words: Number of words added.
        deleted_words: Number of words deleted.
        total_words: Word count used as the denominator.

    Returns:
        ``(added_words + deleted_words) / total_words * 100``. When
        ``total_words`` is 0 the ratio is undefined: the function returns
        ``0.0`` if nothing changed and ``float('inf')`` otherwise, instead
        of raising ``ZeroDivisionError``.
    """
    changed_words = added_words + deleted_words
    if total_words == 0:
        # An empty file gives a zero denominator; report "no change" or an
        # unbounded change rather than crashing the caller.
        return float('inf') if changed_words else 0.0
    return changed_words / total_words * 100
def _count_words_at_commit(repo, commit_sha, path):
    """Return the word count of ``path`` as stored in ``commit_sha`` (0 if absent)."""
    try:
        blob = repo.commit(commit_sha).tree / path
    except KeyError:
        # The path is not in this commit's tree, e.g. the commit deleted or
        # renamed the file.
        return 0
    # Replace undecodable bytes instead of aborting; they do not affect the
    # whitespace-based word split in any meaningful way.
    text = blob.data_stream.read().decode('utf-8', errors='replace')
    return len(text.split())


def main():
    """Write ``output.csv`` with the percentage change of each commit.

    For every commit returned by ``get_commits_for_file`` for the
    module-level ``file_path``, one row is written with the columns
    ``commit`` (abbreviated SHA) and ``percentage_change``. The first commit
    in the list is always reported as 100. For the others, the words added
    plus deleted by the commit are divided by the file's word count at that
    commit, so values above 100 are possible.

    The CSV is created (or overwritten) in the current working directory.

    NOTE(review): the tree lookup needs a path relative to the repository
    root, whereas ``git diff`` resolves the path relative to the current
    directory. The two appear to agree only when the script is run from the
    repository root -- confirm before using it from a subdirectory.
    """
    repo = git.Repo(".", search_parent_directories=True)
    commits = get_commits_for_file(file_path)
    with open('output.csv', 'w', newline='') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=['commit', 'percentage_change'])
        writer.writeheader()
        for index, commit in enumerate(commits):
            if index == 0:
                # There is no earlier version of the file to compare with.
                percentage_change = 100
            else:
                total_words = _count_words_at_commit(repo, commit, file_path)
                added_words, deleted_words = count_words(
                    get_commit_diff(commit, file_path)
                )
                if total_words:
                    percentage_change = calculate_percentage_change(
                        added_words, deleted_words, total_words
                    )
                else:
                    # Empty or absent file: the denominator is zero, so
                    # record "no change" or an unbounded change and keep
                    # going instead of losing the remaining rows.
                    changed_words = added_words + deleted_words
                    percentage_change = float('inf') if changed_words else 0.0
            writer.writerow({'commit': commit, 'percentage_change': percentage_change})
# Run the analysis only when this file is executed as a script.
if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment