Skip to content

Instantly share code, notes, and snippets.

@bbengfort
Last active December 5, 2022 13:12
Show Gist options
  • Save bbengfort/7a7e40930275f1d5633c3c59afc93f5d to your computer and use it in GitHub Desktop.
Save bbengfort/7a7e40930275f1d5633c3c59afc93f5d to your computer and use it in GitHub Desktop.
Iterate through all commits and yield file version information or a time series of file changes.
## Imports
import os
import git
## Module Constants
DATE_TIME_FORMAT = "%Y-%m-%dT%H:%M:%S%z"
EMPTY_TREE_SHA = "4b825dc642cb6eb9a060e54bf8d69288fbee4904"
def versions(path, branch='master'):
    """
    Iterate over every commit reachable from ``branch`` in the repository
    located at ``path`` and yield one record per file touched by each commit,
    producing a time series of file changes.

    Each yielded record is a dict holding the commit's per-file stats entry
    plus the following keys:

    - ``object``: ``path`` joined with the file's path inside the repository
    - ``commit``: hex SHA of the commit
    - ``author``: email address of the commit author
    - ``timestamp``: authored datetime formatted with DATE_TIME_FORMAT
    - ``size``: value of ``diff_size`` for the file's diff, or None when no
      diff entry could be matched to the stats path
    - ``type``: value of ``diff_type`` for the file's diff, or None when no
      diff entry could be matched to the stats path

    Only the first parent of a commit is diffed against, so merge commits
    are described relative to their first parent.
    """
    # Create the repository, raises an error if it isn't one.
    repo = git.Repo(path)

    # Iterate through every commit for the given branch in the repository
    for commit in repo.iter_commits(branch):
        # Determine the parent of the commit to diff against.
        # If no parent, this is the first commit, so use the empty tree.
        parent = commit.parents[0] if commit.parents else EMPTY_TREE_SHA

        # commit.diff(other) places the commit on the "a" side and `other`
        # on the "b" side. R=True swaps the sides so that "a" is the parent
        # (old state) and "b" is the commit (new state); otherwise added
        # files would be flagged as deleted and vice versa.
        by_current = {}   # path in this commit -> diff
        by_previous = {}  # pre-rename path -> diff (renames only)
        for entry in commit.diff(parent, R=True):
            by_current[entry.b_path] = entry
            if entry.renamed:
                by_previous[entry.a_path] = entry

        # The stats on the commit is a summary of all the changes for this
        # commit, we'll iterate through it to get the information we need.
        for objpath, stats in commit.stats.files.items():
            # Select the diff for the path in the stats. If the path is not
            # a current path, it may be the old name of a renamed file.
            diff = by_current.get(objpath)
            if diff is None:
                diff = by_previous.get(objpath)

            # Copy the stats entry and add the additional information. When
            # no diff matches the stats path, report size and type as None
            # rather than borrowing values from an unrelated diff.
            record = dict(stats)
            record.update({
                'object': os.path.join(path, objpath),
                'commit': commit.hexsha,
                'author': commit.author.email,
                'timestamp': commit.authored_datetime.strftime(DATE_TIME_FORMAT),
                'size': diff_size(diff) if diff is not None else None,
                'type': diff_type(diff) if diff is not None else None,
            })
            yield record


def diff_size(diff):
    """
    Computes the size of the diff by comparing the size of the blobs.

    The "a" blob is treated as the old version and the "b" blob as the new
    version, so the result is ``b_blob.size - a_blob.size``: positive when
    the file grew, negative when it shrank. A missing blob counts as size
    zero, which makes a new file return its full size and a deleted file
    return the negative of its original size.
    """
    old_size = diff.a_blob.size if diff.a_blob is not None else 0
    new_size = diff.b_blob.size if diff.b_blob is not None else 0
    return new_size - old_size
def diff_type(diff):
    """
    Maps the flags on a diff to a single-letter change code.

    The flags are inspected in a fixed order and the first truthy one wins:
    ``renamed`` gives 'R', ``deleted_file`` gives 'D', ``new_file`` gives
    'A'. If none of the flags is set the code is 'M'.
    """
    # Ordered (code, attribute) pairs; attributes are read lazily so a later
    # flag is never touched once an earlier one has matched.
    ordered_flags = (
        ('R', 'renamed'),
        ('D', 'deleted_file'),
        ('A', 'new_file'),
    )
    for code, attribute in ordered_flags:
        if getattr(diff, attribute):
            return code
    return 'M'
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment