Created
April 18, 2024 09:55
-
-
Save jwbowers/48881df7df9bf33bca029db8fe9c9bc6 to your computer and use it in GitHub Desktop.
List the percentage change in text between each commit and the previous commit. Can be more than 100.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import subprocess | |
import sys | |
import csv | |
# uses the GitPython module | |
import git | |
from pathlib import Path | |
## Note: this script requires the GitPython package to be installed.
## Recommended setup:
## python3 -m venv .venv | |
## source .venv/bin/activate | |
## pip3 install GitPython | |
## To use on the file causal-inference.Rmd at the unix/mac command line:
## python3 commit_sizes.py causal-inference/causal-inference.Rmd
## Basically, we just need to give it the path to a file.
# Path of the file to analyse, taken from the first command-line argument.
if len(sys.argv) < 2:
    # Exit with a usage hint rather than an IndexError traceback.
    sys.exit("usage: python3 commit_sizes.py <path-to-file>")
file_path = sys.argv[1]
def get_commits_for_file(file_path):
    """Return the abbreviated SHAs of the commits that touched ``file_path``.

    The repository is located by searching upward from the current
    directory. Commits are requested with ``reverse=True``, i.e. in the
    reverse of git's default ordering, and each SHA is cut to its first
    seven characters.

    Returns an empty list (after printing an error) when GitPython raises
    ``NoSuchPathError``.
    """
    try:
        repository = git.Repo(".", search_parent_directories=True)
        # Restrict the history walk to commits that affect this path.
        history = repository.iter_commits(paths=file_path, reverse=True)
        return [entry.hexsha[:7] for entry in history]
    except git.exc.NoSuchPathError:
        print(f"Error: File '{file_path}' not found in repository.")
        return []
#file_path = "causal-inference/causal-inference.Rmd" | |
#commits = get_commits_for_file(file_path) | |
## | |
##if commits: | |
## print("Commits for", file_path, ":") | |
## for commit_sha in commits: | |
## print(commit_sha) | |
##else: | |
## print("No commits found for", file_path) | |
def get_commit_diff(commit_hash, file_path):
    """Return the diff a commit introduced to ``file_path`` as a list of lines.

    Runs ``git diff <commit>^ <commit> -- <file_path>`` in the current
    working directory and splits its standard output on newlines.

    The parent is given first so that lines the commit added are prefixed
    with ``+`` and lines it removed with ``-``. The ``--`` separator makes
    git treat ``file_path`` as a path even when the file no longer exists in
    the working tree.

    The exit status and standard error of git are not inspected; if git
    fails (for example because the commit has no parent), the result is
    whatever was written to standard output, typically ``['']``.
    """
    result = subprocess.run(
        ['git', 'diff', commit_hash + '^', commit_hash, '--', file_path],
        capture_output=True,
        text=True,
    )
    return result.stdout.split('\n')
def count_words(diff_lines):
    """Count whitespace-separated words on added and deleted diff lines.

    Args:
        diff_lines: Lines of a unified diff (without trailing newlines).

    Returns:
        A tuple ``(added_words, deleted_words)``.

    Lines between a ``diff ...`` line and the next ``@@`` hunk header are
    per-file metadata (``index ...``, ``--- a/path``, ``+++ b/path``) and are
    skipped, so file names are not counted as changed words. Input that
    contains no ``diff ...`` line is counted line by line as before.
    """
    added_words = 0
    deleted_words = 0
    in_file_header = False
    for line in diff_lines:
        if line.startswith('diff '):
            # Start of a new file section: ignore everything up to the hunk.
            in_file_header = True
            continue
        if line.startswith('@@'):
            # Hunk header: the lines that follow are real content.
            in_file_header = False
            continue
        if in_file_header:
            continue
        if line.startswith('+'):
            added_words += len(line[1:].split())
        elif line.startswith('-'):
            deleted_words += len(line[1:].split())
    return added_words, deleted_words
def calculate_percentage_change(added_words, deleted_words, total_words):
    """Return changed words as a percentage of the file's word count.

    Args:
        added_words: Number of words on added lines.
        deleted_words: Number of words on deleted lines.
        total_words: Number of words in the file used as the baseline.

    Returns:
        ``(added_words + deleted_words) / total_words * 100``. The value can
        exceed 100 when more words changed than the file contains.

        When ``total_words`` is 0 the ratio is undefined. In that case the
        result is ``0.0`` if no words changed and ``100.0`` otherwise,
        treating the file as entirely changed.
    """
    changed_words = added_words + deleted_words
    if total_words == 0:
        # Avoid ZeroDivisionError for a file that is empty at this commit.
        return 100.0 if changed_words else 0.0
    return changed_words / total_words * 100
def main():
    """Write ``output.csv`` with the percentage change of each commit.

    For every commit returned by ``get_commits_for_file(file_path)`` one row
    with the columns ``commit`` and ``percentage_change`` is written:

    * the first commit in the list is recorded as 100;
    * a commit whose tree no longer contains ``file_path`` (the file was
      deleted there) is also recorded as 100;
    * otherwise the words on added and deleted diff lines are divided by the
      word count of the file at that commit.
    """
    repo = git.Repo(".", search_parent_directories=True)  # Open the Git repository
    commits = get_commits_for_file(file_path)
    with open('output.csv', 'w', newline='') as csvfile:
        fieldnames = ['commit', 'percentage_change']
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        # Use the position, not the abbreviated SHA, to detect the first commit.
        for position, commit in enumerate(commits):
            if not commit:
                continue
            if position == 0:
                percentage_change = 100
            else:
                try:
                    git_blob = repo.commit(commit).tree / file_path
                except KeyError:
                    # The path is missing from this commit's tree, i.e. the
                    # commit removed the file: count it as fully changed.
                    percentage_change = 100
                else:
                    # Only whitespace-separated words matter here, so
                    # undecodable bytes are replaced instead of aborting.
                    file_content = git_blob.data_stream.read().decode('utf-8', errors='replace')
                    total_words = len(file_content.split())
                    diff_lines = get_commit_diff(commit, file_path)
                    added_words, deleted_words = count_words(diff_lines)
                    percentage_change = calculate_percentage_change(added_words, deleted_words, total_words)
            writer.writerow({'commit': commit, 'percentage_change': percentage_change})
# Produce the report only when this file is executed as a script.
if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment