Skip to content

Instantly share code, notes, and snippets.

@josephlou5
Last active December 1, 2022 02:23
Show Gist options
  • Save josephlou5/6a2db33b82f9608a435c016106e2144c to your computer and use it in GitHub Desktop.
Save josephlou5/6a2db33b82f9608a435c016106e2144c to your computer and use it in GitHub Desktop.
Gets the history of a specific file in all the commits of a repository
"""
get_file_history.py
Gets the history of a specific file in all the commits of a repo.
GitPython: https://gitpython.readthedocs.io/en/stable/index.html
"""
# ==============================================================================
import json
from pathlib import Path
import git
# ==============================================================================
# Path to the repository folder. Passed to `git.Repo()` in `main()`.
REPO_PATH = '.'
# The file paths to filter by. Only return commits that change any of
# these paths. If empty, returns all commits.
# Passed as the `paths` argument of `repo.iter_commits()`.
FILE_PATHS = [
'data/',
]
# The number of commits to process. Set to None to process all commits.
# Commits that are skipped because they are already recorded in
# `PROCESSED_COMMITS` do not count towards this limit.
NUM_COMMITS = None
# `strftime` format used for the 'datetime' field stored for each commit.
DATETIME_FMT = '%Y-%m-%d %H:%M:%S'
# ==============================================================================
# JSON file mapping each processed commit SHA to its recorded metadata.
# It is read at the start of `main()` and rewritten when `main()` exits.
PROCESSED_COMMITS = Path('processed_commits.json')
# ==============================================================================
def process_file(commit_sha, commit_datetime, filename, file_contents):
    """Do something with the file, such as save it.

    This is a user hook and does nothing by default. `main()` calls it
    once for every changed file that still exists in the tree of each
    commit it processes, even if the file is not under one of the
    `FILE_PATHS`. If needed, filter it out before processing the file.

    Args:
        commit_sha: The commit's hex SHA (`commit.hexsha`).
        commit_datetime: The commit's `committed_datetime`.
        filename: Path of the file within the repository. For a renamed
            file this is the new path.
        file_contents: The result of reading the blob's data stream at
            that commit (presumably raw `bytes`; decode it if text is
            needed).
    """
# ==============================================================================
def _resolve_renamed_path(stat_path):
    """Return the current (post-rename) path encoded in a stats entry.

    Rename entries look like `old => new` (whole path renamed) or
    `prefix/{old => new}/suffix` (only a part renamed). Entries without
    ' => ' are returned unchanged.
    """
    if ' => ' not in stat_path:
        return stat_path
    left, right = stat_path.split(' => ', 1)
    if '{' not in left or '}' not in right:
        # the entire file was renamed
        return right
    # only a part was renamed: keep what precedes '{' and follows '}'
    prefix = left.rsplit('{', 1)[0]
    new_part, suffix = right.split('}', 1)
    if (not new_part and suffix.startswith('/')
            and (not prefix or prefix.endswith('/'))):
        # a path component was removed (e.g. 'a/{sub => }/file'); drop
        # the extra separator so the result is 'a/file', not 'a//file'
        suffix = suffix[1:]
    return prefix + new_part + suffix


def main():
    """Walk the commits touching `FILE_PATHS` and process their files.

    Commits already recorded in `PROCESSED_COMMITS` are skipped. For
    every other commit, each changed file that still exists in the
    commit's tree is passed to `process_file()`, and the commit's
    metadata is recorded. The record is written back to
    `PROCESSED_COMMITS` even if processing stops early with an error.
    """
    try:
        processed_commits = json.loads(
            PROCESSED_COMMITS.read_text(encoding='utf-8'))
    except FileNotFoundError:
        # first run: nothing has been processed yet
        processed_commits = {}

    repo = git.Repo(REPO_PATH)

    try:
        count = 0
        for commit in repo.iter_commits(paths=FILE_PATHS):
            # check the limit first so that `NUM_COMMITS = 0` processes
            # no commits at all
            if NUM_COMMITS is not None and count >= NUM_COMMITS:
                break

            commit_sha = commit.hexsha
            if commit_sha in processed_commits:
                # do not re-process this commit
                continue

            commit_datetime = commit.committed_datetime

            commit_files = []
            for file in commit.stats.files:
                file = _resolve_renamed_path(file)
                try:
                    file_data = commit.tree / file
                except KeyError:
                    # file was deleted (probably)
                    continue
                commit_files.append(file)
                file_contents = file_data.data_stream.read()
                process_file(commit_sha, commit_datetime, file, file_contents)

            processed_commits[commit_sha] = {
                'author': commit.author.name,
                'email': commit.author.email,
                'datetime': commit_datetime.strftime(DATETIME_FMT),
                'message': commit.message,
                'files': commit_files,
            }
            count += 1
    finally:
        # always save progress, even if processing was interrupted
        PROCESSED_COMMITS.write_text(json.dumps(processed_commits, indent=2),
                                     encoding='utf-8')


if __name__ == '__main__':
    main()
@josephlou5
Copy link
Author

Note: If you are using a local clone of the repository (as I am), you must run `git pull` before running this script so that it sees the latest commits.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment