Skip to content

Instantly share code, notes, and snippets.

@rconde01
Created July 4, 2022 15:36
Show Gist options
  • Save rconde01/ab93a0edddc5b0abf64ad4c8ac5b6ade to your computer and use it in GitHub Desktop.
Save rconde01/ab93a0edddc5b0abf64ad4c8ac5b6ade to your computer and use it in GitHub Desktop.
import os
from os import path
import subprocess
from hashlib import sha256
from collections.abc import Iterable
import git_filter_repo as fr
def chunked(size: int, chunk_size: int) -> Iterable[int]:
    """Return the lengths of consecutive chunks that together cover *size*.

    Every length equals ``chunk_size`` except possibly the last one, which
    is whatever remains. A ``size`` of zero or less produces no chunks.

    Args:
        size: Total number of items/bytes to split.
        chunk_size: Maximum length of a single chunk; must be positive.

    Returns:
        A lazy iterable of chunk lengths.

    Raises:
        ValueError: If ``chunk_size`` is not positive. Such a value can
            never consume ``size`` and would otherwise iterate forever.
    """
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be positive, got {chunk_size}")
    # Each start offset contributes a full chunk, or the tail if shorter.
    return (
        min(chunk_size, size - start)
        for start in range(0, size, chunk_size)
    )
def git(commands: list[str], dir: str = None) -> None:
    """Invoke ``git`` with *commands*, optionally targeting another directory.

    Args:
        commands: Arguments placed after ``git`` (and after ``-C dir``).
        dir: When not ``None``, passed to git as ``-C <dir>``.

    The command runs through ``run_command`` with ``GIT_LFS_SKIP_SMUDGE=1``
    added to the environment.
    """
    location = [] if dir is None else ['-C', dir]
    run_command(
        ['git', *location, *commands],
        env={'GIT_LFS_SKIP_SMUDGE': '1'},
    )
def run_command(args: list[str], env=None, cwd=None) -> None:
    """Run *args* as a subprocess and fail loudly on a non-zero exit status.

    Args:
        args: Program and arguments, passed to ``subprocess.run`` as-is.
        env: Optional mapping of extra environment variables. They are
            layered on top of the current process environment, overriding
            variables of the same name.
        cwd: Optional working directory for the child process.

    Raises:
        subprocess.CalledProcessError: If the command exits non-zero
            (``check=True``).
    """
    child_env = dict(os.environ)
    if env:
        child_env.update(env)
    subprocess.run(args, check=True, env=child_env, cwd=cwd)
# Shared state used by the commit callback defined inside lfs_import().

# Blob id seen on a file change -> id of the blob the filter inserted to
# replace it (an LFS pointer or a rewritten .gitattributes).
edited_blobs: dict = dict()

# Decoded filenames that have been migrated so far; used both to keep
# migrating those paths and to generate the .gitattributes rules.
migrated_files: list[str] = list()

# Decoded text and blob id of the most recent .gitattributes change the
# callback has seen; both stay None until one is encountered.
last_git_attributes_contents = None
last_git_attributes_hash = None
# Text of a Git LFS pointer file; the placeholders take the sha256 hex digest
# and the object size in bytes.
_LFS_POINTER_TEMPLATE = """version https://git-lfs.github.com/spec/v1
oid sha256:{}
size {}
"""
# A blob starting with this is treated as already being an LFS pointer.
_LFS_POINTER_PREFIX = b'version https://git-lfs.github.com/spec/v1'
# Directory (relative to the working directory) that receives LFS objects.
_LFS_OBJECT_DIR = '.git/lfs/objects'


def _read_git_object(rev: bytes) -> bytes:
    """Return the contents of object *rev* via ``git cat-file --batch``.

    The child process runs inside a ``with`` block so that its pipes are
    closed and the process is reaped even when reading fails.

    Raises:
        RuntimeError: If git does not answer with a ``<hash> <type> <size>``
            header, or the payload is shorter than announced or not followed
            by the terminating newline.
    """
    with subprocess.Popen(['git', '-C', os.getcwd(), 'cat-file', '--batch'],
                          stdin=subprocess.PIPE,
                          stdout=subprocess.PIPE) as cat_file_proc:
        cat_file_proc.stdin.write(rev + b'\n')
        # Closing stdin flushes the request and lets git exit after replying.
        cat_file_proc.stdin.close()
        header = cat_file_proc.stdout.readline().split()
        if len(header) != 3:
            raise RuntimeError(
                'unexpected reply from git cat-file for {!r}: {!r}'.format(
                    rev, header))
        size = int(header[2])
        data = cat_file_proc.stdout.read(size)
        terminator = cat_file_proc.stdout.read(1)
    if len(data) != size or terminator != b'\n':
        raise RuntimeError(
            'truncated output from git cat-file for {!r}'.format(rev))
    return data


def _with_lfs_rules(base_contents, tracked_files) -> str:
    """Return *base_contents* followed by one LFS rule line per tracked file.

    ``base_contents`` may be ``None`` (no .gitattributes seen yet). Non-empty
    base text is separated from the rules by a newline, and spaces in
    filenames are written as ``[[:space:]]``.
    """
    prefix = base_contents + '\n' if base_contents else ''
    rules = [
        '{} filter=lfs diff=lfs merge=lfs -text\n'.format(
            name.replace(' ', '[[:space:]]'))
        for name in tracked_files
    ]
    return prefix + ''.join(rules)


def lfs_import(size_threshold) -> None:
    """Rewrite the repository in the working directory to store files in LFS.

    Runs a git-filter-repo ``RepoFilter`` (``--force --replace-refs
    delete-no-add``) whose commit callback replaces selected blobs with LFS
    pointer blobs, writes the real content under ``.git/lfs/objects``, and
    keeps ``.gitattributes`` listing every migrated path.

    Args:
        size_threshold: Blobs whose unpacked size (as reported by
            ``fr.GitUtils.get_blob_sizes()``) exceeds this many bytes are
            migrated. ``0`` disables the size lookup and size-based
            selection.

    The callback reads and updates the module-level ``edited_blobs``,
    ``migrated_files``, ``last_git_attributes_contents`` and
    ``last_git_attributes_hash``.
    """
    unpacked_sizes = {}
    if size_threshold != 0:
        unpacked_sizes, _ = fr.GitUtils.get_blob_sizes()

    def migrate_lfs_commits(commit: fr.Commit, metadata: dict[str, object]):
        """Commit callback: swap large blobs for LFS pointers in *commit*."""
        # Only the two names that are rebound need ``global``; edited_blobs
        # and migrated_files are mutated in place.
        global last_git_attributes_contents
        global last_git_attributes_hash

        migrated_in_commit = False
        attributes_commit_change = None

        for change in commit.file_changes:
            if change.type == b'D':
                continue  # deleted files have no remaining content to filter
            if change.mode in (b'120000', b'160000'):
                continue  # symlinks and submodules aren't text files we can filter
            if change.blob_id in edited_blobs:
                # This blob was already replaced earlier; reuse the result.
                change.blob_id = edited_blobs[change.blob_id]
                continue

            filename = change.filename.decode()
            if filename == ".gitattributes":
                last_git_attributes_contents = subprocess.check_output(
                    ['git', 'cat-file', 'blob', change.blob_id.decode()]).decode()
                last_git_attributes_hash = change.blob_id
                attributes_commit_change = change
                continue

            # Once we migrate a path...we should always migrate it
            path_already_migrated = filename in migrated_files
            over_size_threshold = (
                size_threshold != 0
                and unpacked_sizes[change.blob_id] > size_threshold)
            if not (path_already_migrated or over_size_threshold):
                continue

            orig_rev = fr.ID_TO_HASH[change.blob_id] if change.blob_id in fr.ID_TO_HASH else change.blob_id
            lfs_data = _read_git_object(orig_rev)

            # Maybe it already is an LFS object
            if lfs_data.startswith(_LFS_POINTER_PREFIX):
                continue

            # Record each path once so .gitattributes gets no duplicate rules.
            if not path_already_migrated:
                migrated_files.append(filename)

            # update the file to be a LFS pointer
            sha256_checksum = sha256(lfs_data).hexdigest()
            content = _LFS_POINTER_TEMPLATE.format(
                sha256_checksum, len(lfs_data)).encode()
            lfs_blob = fr.Blob(content)
            repo_filter.insert(lfs_blob)
            edited_blobs[change.blob_id] = lfs_blob.id
            change.blob_id = lfs_blob.id

            # Store the real content at objects/<aa>/<bb>/<sha256>.
            sub_folder = path.join(
                os.getcwd(), _LFS_OBJECT_DIR, sha256_checksum[0:2], sha256_checksum[2:4])
            os.makedirs(sub_folder, exist_ok=True)
            with open(path.join(sub_folder, sha256_checksum), 'wb') as f:
                f.write(lfs_data)
            print('\nLFS {}: "{}"'.format(change.blob_id, filename))
            migrated_in_commit = True

        # update the .gitattributes file
        # Nothing to add while no path has been migrated yet, so leave an
        # incoming .gitattributes change untouched in that case.
        if not migrated_files:
            return
        if not migrated_in_commit and attributes_commit_change is None:
            return

        new_attributes_contents = _with_lfs_rules(
            last_git_attributes_contents, migrated_files)
        blob = fr.Blob(new_attributes_contents.encode())
        repo_filter.insert(blob)
        if attributes_commit_change is not None:
            edited_blobs[attributes_commit_change.blob_id] = blob.id
            attributes_commit_change.blob_id = blob.id
            print('\nModified change to .gitattributes to include LFS files.')
        else:
            # TODO - handle case where attributes changed in same commit as LFS file added
            if last_git_attributes_hash is not None:
                edited_blobs[last_git_attributes_hash] = blob.id
            commit.file_changes.append(fr.FileChange(
                b'M', ".gitattributes".encode(), blob.id, b'100644'))
            print('\nAdded change to .gitattributes to track additional LFS files.')

    # Named repo_filter (not ``filter``) to avoid shadowing the builtin; the
    # callback above looks it up when it runs, after this assignment.
    repo_filter = fr.RepoFilter(fr.FilteringOptions.parse_args(
        ["--force", "--replace-refs", "delete-no-add"]), commit_callback=migrate_lfs_commits)
    repo_filter.run()
if __name__ == "__main__":
    # Script entry point: migrate with a 10 MiB size threshold.
    ten_mib = 10 * 1024 * 1024
    lfs_import(ten_mib)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment