Created
July 4, 2022 15:36
-
-
Save rconde01/ab93a0edddc5b0abf64ad4c8ac5b6ade to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os
import subprocess
from collections.abc import Iterable
from hashlib import sha256
from os import path
from typing import Optional

import git_filter_repo as fr
def chunked(size: int, chunk_size: int) -> Iterable[int]:
    """Yield the lengths of successive chunks covering *size* bytes.

    Every yielded value equals ``chunk_size`` except possibly the last,
    which is the remainder. Yields nothing when *size* is zero or less.
    """
    for offset in range(0, size, chunk_size):
        yield min(chunk_size, size - offset)
def git(commands: list[str], dir: Optional[str] = None) -> None:
    """Run a git subcommand, optionally inside directory *dir*.

    ``GIT_LFS_SKIP_SMUDGE=1`` is set so any checkout the command performs
    does not try to download LFS content.

    Raises:
        subprocess.CalledProcessError: if git exits with a non-zero status.
    """
    # NOTE: `dir` shadows the builtin, but the name is part of the public
    # keyword interface of this helper, so it is kept for compatibility.
    # Annotation fixed: the default is None, so the type is Optional[str].
    args = ['git']
    if dir is not None:
        args = [*args, '-C', dir]
    args = [*args, *commands]
    run_command(args, env={'GIT_LFS_SKIP_SMUDGE': '1'})
def run_command(args: list[str], env=None, cwd=None) -> None:
    """Execute *args* as a subprocess, merging *env* over the current environment.

    Raises ``subprocess.CalledProcessError`` when the process exits non-zero.
    """
    extra = env if env else {}
    merged_env = dict(os.environ)
    merged_env.update(extra)
    subprocess.run(args, check=True, env=merged_env, cwd=cwd)
# Original blob id -> replacement (LFS pointer) blob id, shared across commits.
edited_blobs = {}
# Paths already converted to LFS; once a path is migrated it stays migrated.
migrated_files = []
# Contents and blob id of the most recently seen .gitattributes, used when
# rewriting or synthesizing .gitattributes changes during filtering.
last_git_attributes_contents = None
last_git_attributes_hash = None
def lfs_import(size_threshold: int) -> None:
    """Rewrite repository history, converting large blobs to Git LFS pointers.

    Runs git-filter-repo over the current repository, replacing every blob
    larger than *size_threshold* bytes (0 disables the size check) with an
    LFS pointer file, storing the original content under .git/lfs/objects,
    and keeping .gitattributes tracking rules in sync.
    """
    # Blob id -> unpacked size, consulted for the size-threshold test below.
    unpacked_sizes = {}
    if size_threshold != 0:
        unpacked_sizes, _ = fr.GitUtils.get_blob_sizes()
    # Template for an LFS pointer file, filled with the content's sha256
    # and its size in bytes (see the git-lfs pointer-file spec).
    lfs_object_content = """version https://git-lfs.github.com/spec/v1
oid sha256:{}
size {}
"""
    # LFS local object store layout: .git/lfs/objects/<aa>/<bb>/<full sha256>.
    lfs_object_path = '.git/lfs/objects'

    def migrate_lfs_commits(commit: fr.Commit, metadata: dict[str, any]):
        """filter-repo commit callback: rewrite one commit's file changes."""
        # NOTE(review): `migrated_lfs_blobs` and `edited_attributes` are
        # declared global but never assigned anywhere visible in this file —
        # presumably leftovers from an earlier revision; confirm before relying
        # on them.
        global migrated_lfs_blobs
        global migrated_files
        global last_git_attributes_contents
        global last_git_attributes_hash
        global edited_attributes
        # Blobs migrated in THIS commit that .gitattributes must now track.
        new_files_to_track = {}
        change: fr.FileChange
        # The change touching .gitattributes in this commit, if any.
        attributes_commit_change: fr.FileChange = None
        for change in commit.file_changes:
            if change.type == b'D':
                continue  # deleted files have no remaining content to filter
            if change.mode in (b'120000', b'160000'):
                continue  # symlinks and submodules aren't text files we can filter
            # Blob already rewritten for an earlier commit: just re-point it.
            if change.blob_id in edited_blobs:
                change.blob_id = edited_blobs[change.blob_id]
                continue
            filename = change.filename.decode()
            if filename == ".gitattributes":
                # Remember the current .gitattributes contents so tracking
                # rules can be appended after the file-change loop.
                last_git_attributes_contents = subprocess.check_output(
                    ['git', 'cat-file', 'blob', change.blob_id.decode()]).decode()
                last_git_attributes_hash = change.blob_id
                attributes_commit_change = change
                continue
            # Once we migrate a path...we should always migrate it
            path_already_migrated = filename in migrated_files
            over_size_threshold = size_threshold != 0 and unpacked_sizes[
                change.blob_id] > size_threshold
            if path_already_migrated or over_size_threshold:
                # filter-repo's short ids map to real hashes once known; fall
                # back to the id itself when no mapping exists yet.
                orig_rev = fr.ID_TO_HASH[change.blob_id] if change.blob_id in fr.ID_TO_HASH else change.blob_id
                # Stream the blob through `git cat-file --batch`; its first
                # output line is "<hash> <type> <size>".
                cat_file_proc = subprocess.Popen(['git', '-C', os.getcwd(), 'cat-file', '--batch'],
                                                 stdin=subprocess.PIPE,
                                                 stdout=subprocess.PIPE)
                cat_file_proc.stdin.write(orig_rev + b'\n')
                cat_file_proc.stdin.flush()
                _, _, objsize = cat_file_proc.stdout.readline().split()
                checksum = sha256()
                migrate = True
                lfs_data = bytearray()
                position = 0
                # Read the object in 8 KiB chunks, hashing and buffering it.
                for chunk in chunked(int(objsize), 2 ** 13):
                    bytes_chunk = cat_file_proc.stdout.read(chunk)
                    # Maybe it already is an LFS object
                    if position == 0 and bytes_chunk.startswith(b'version https://git-lfs.github.com/spec/v1'):
                        migrate = False
                    position += 1
                    checksum.update(bytes_chunk)
                    lfs_data.extend(bytes_chunk)
                # --batch terminates each object's content with a newline.
                assert b"\n" == cat_file_proc.stdout.read(1)
                cat_file_proc.kill()
                if not migrate:
                    continue
                migrated_files.append(filename)
                # update the file to be a LFS pointer
                sha256_checksum = checksum.hexdigest()
                content = lfs_object_content.format(
                    sha256_checksum, objsize.decode()).encode()
                lfs_blob = fr.Blob(content)
                filter.insert(lfs_blob)
                edited_blobs[change.blob_id] = lfs_blob.id
                change.blob_id = lfs_blob.id
                # Store the original content in the local LFS object store so
                # the rewritten checkout can resolve the pointer offline.
                sub_folder = path.join(
                    os.getcwd(), lfs_object_path, sha256_checksum[0:2], sha256_checksum[2:4])
                os.makedirs(sub_folder, exist_ok=True)
                with open(path.join(sub_folder, sha256_checksum), 'wb') as f:
                    f.write(bytes(lfs_data))
                print('\nLFS {}: "{}"'.format(change.blob_id, filename))
                new_files_to_track[change.blob_id] = filename
        # update the .gitattributes file
        if len(new_files_to_track) != 0 or not attributes_commit_change is None:
            new_attributes_contents = "" if last_git_attributes_contents is None else last_git_attributes_contents
            if new_attributes_contents != "":
                new_attributes_contents += '\n'
            # One LFS tracking rule per migrated path; spaces are escaped as
            # [[:space:]] for .gitattributes pattern syntax.
            for f in migrated_files:
                escaped_filename = f.replace(" ", "[[:space:]]")
                new_attributes_contents += f'{escaped_filename} filter=lfs diff=lfs merge=lfs -text\n'
            if not attributes_commit_change is None:
                # This commit already touches .gitattributes: swap in a new blob.
                blob = fr.Blob(new_attributes_contents.encode())
                filter.insert(blob)
                edited_blobs[attributes_commit_change.blob_id] = blob.id
                attributes_commit_change.blob_id = blob.id
                print('\nModified change to .gitattributes to include LFS files.')
            else:
                # No .gitattributes change in this commit: synthesize one.
                # TODO - handle case where attributes changed in same commit as LFS file added
                blob = fr.Blob(new_attributes_contents.encode())
                filter.insert(blob)
                if last_git_attributes_hash != None:
                    edited_blobs[last_git_attributes_hash] = blob.id
                commit.file_changes.append(fr.FileChange(
                    b'M', ".gitattributes".encode(), blob.id, b'100644'))
                print('\nAdded change to .gitattributes to track additional LFS files.')

    # NOTE(review): `filter` shadows the builtin; migrate_lfs_commits refers
    # to it via closure, and it is bound here before the callback first runs.
    filter = fr.RepoFilter(fr.FilteringOptions.parse_args(
        ["--force", "--replace-refs", "delete-no-add"]), commit_callback=migrate_lfs_commits)
    filter.run()
if __name__ == "__main__":
    # Migrate every blob larger than 10 MiB (and all previously migrated paths).
    lfs_import(10*1024*1024)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment