Skip to content

Instantly share code, notes, and snippets.

@lstrojny
Last active June 30, 2022 17:19
Show Gist options
  • Save lstrojny/6d29aea45179668725f43650fa46c4e7 to your computer and use it in GitHub Desktop.
Save lstrojny/6d29aea45179668725f43650fa46c4e7 to your computer and use it in GitHub Desktop.
Use git-filter-repo to import Git LFS objects
import os
from os import path
import shutil
import subprocess
from functools import lru_cache
import re
from fnmatch import translate
from hashlib import sha256
from tempfile import NamedTemporaryFile
from collections.abc import Iterable
from vendor import git_filter_repo as fr
def chunked(size: int, chunk_size: int) -> Iterable[int]:
    """Yield the lengths of consecutive chunks that together cover ``size``.

    Every yielded value equals ``chunk_size`` except possibly the last one,
    which is whatever remains. Nothing is yielded when ``size`` is zero or
    negative.

    Args:
        size: Total number of units to split up.
        chunk_size: Maximum length of a single chunk; must be positive.

    Raises:
        ValueError: If ``chunk_size`` is not positive. Without this check a
            non-positive ``chunk_size`` would never reduce the remainder and
            the generator would never terminate.
    """
    if chunk_size <= 0:
        raise ValueError('chunk_size must be a positive integer, got {}'.format(chunk_size))
    for offset in range(0, size, chunk_size):
        yield min(chunk_size, size - offset)
def git(commands: list[str], dir: str = None) -> None:
    """Invoke ``git`` with the given arguments.

    Args:
        commands: Arguments placed after ``git`` (and after ``-C <dir>`` when
            a directory is given).
        dir: Optional directory passed to git through its ``-C`` option.

    The command is run via ``run_command`` with ``GIT_LFS_SKIP_SMUDGE=1``
    added to the environment.
    """
    location = [] if dir is None else ['-C', dir]
    run_command(['git', *location, *commands], env={'GIT_LFS_SKIP_SMUDGE': '1'})
def run_command(args: list[str], env=None, cwd=None) -> None:
    """Run an external command and fail loudly on a non-zero exit status.

    Args:
        args: Program and arguments, passed to ``subprocess.run`` as a list.
        env: Optional extra environment variables; they are layered on top of
            the current process environment and win on conflicts.
        cwd: Optional working directory for the child process.

    Raises:
        subprocess.CalledProcessError: If the command exits with a non-zero
            status (``check=True``).
    """
    child_env = dict(os.environ)
    if env:
        child_env.update(env)
    subprocess.run(args, env=child_env, cwd=cwd, check=True)
def lfs_import(source_dir, target_dir) -> None:
    """Rewrite the ``prod`` history of ``source_dir`` into ``target_dir`` using LFS pointers.

    Steps performed:

    1. Initialise ``target_dir`` as a git repository with initial branch ``prod``.
    2. Read ``.gitattributes`` from ``source_dir`` and collect the pattern of
       every line that carries ``filter=lfs``.
    3. Run git-filter-repo from source to target. For each changed file whose
       name matches one of the patterns, the blob is streamed out of the
       source repository through ``git cat-file --batch`` and then either
       * replaced by a freshly generated LFS pointer blob, with the real
         content stored under ``<target>/.git/lfs/objects/<aa>/<bb>/<sha256>``, or
       * left as is when it already is an LFS pointer, in which case the
         referenced object is copied from the source's ``.git/lfs/objects``.

    The process working directory is switched to ``target_dir`` while the
    filter runs and is restored afterwards, even on failure.

    Args:
        source_dir: Path of the repository to read from.
        target_dir: Path of the repository to create and write to.

    Raises:
        RuntimeError: If ``git cat-file --batch`` answers with something other
            than the expected ``<oid> <type> <size>`` header or the object
            payload is not terminated by a newline.
    """
    # Resolve both paths before changing directory: every later join would
    # otherwise be interpreted relative to the target repository.
    source_dir = path.abspath(source_dir)
    target_dir = path.abspath(target_dir)

    git(['init', '-b', 'prod', target_dir])
    cwd = os.getcwd()
    try:
        os.chdir(target_dir)

        with open(path.join(source_dir, '.gitattributes')) as gitattributes:
            # Split on any whitespace (attributes may be tab separated) and
            # ignore comment lines that merely mention filter=lfs.
            glob_expressions = tuple(
                line.split()[0]
                for line in gitattributes
                if 'filter=lfs' in line and not line.lstrip().startswith('#')
            )

        options = [
            '--source', source_dir,
            '--target', target_dir,
            '--quiet',
            '--refs', 'prod',
        ]
        lfs_object_content = (
            'version https://git-lfs.github.com/spec/v1\n'
            'oid sha256:{}\n'
            'size {}\n'
        )
        # Maps an original blob id to the id of the pointer blob replacing it.
        migrated_lfs_blobs = {}
        lfs_object_path = '.git/lfs/objects'

        # The context manager closes the pipes and waits for git to exit, so
        # the helper process does not outlive this function.
        with subprocess.Popen(['git', '-C', source_dir, 'cat-file', '--batch'],
                              stdin=subprocess.PIPE,
                              stdout=subprocess.PIPE) as cat_file_proc:

            def migrate_lfs_commits(commit: fr.Commit, metadata: dict[str, object]):
                """Commit callback: swap LFS-tracked blobs of one commit for pointers."""
                for change in commit.file_changes:
                    if change.type == b'D':
                        continue  # deleted files have no remaining content to filter
                    if change.mode in (b'120000', b'160000'):
                        continue  # symlinks and submodules aren't text files we can filter

                    filename = change.filename.decode()
                    # Check the path before consulting the cache so identical
                    # content stored at a path that is not LFS-tracked is not
                    # turned into a pointer.
                    if not match_expressions(filename, glob_expressions):
                        continue
                    if change.blob_id in migrated_lfs_blobs:
                        change.blob_id = migrated_lfs_blobs[change.blob_id]
                        continue

                    orig_rev = fr.ID_TO_HASH[change.blob_id]
                    cat_file_proc.stdin.write(orig_rev + b'\n')
                    cat_file_proc.stdin.flush()
                    header = cat_file_proc.stdout.readline().split()
                    if len(header) != 3:
                        raise RuntimeError(
                            'unexpected answer from git cat-file for {!r}: {!r}'.format(orig_rev, header))
                    _, _, objsize = header
                    size = int(objsize)

                    checksum = sha256()
                    migrate = True
                    with NamedTemporaryFile() as tmp:
                        for index, chunk in enumerate(chunked(size, 2 ** 13)):
                            bytes_chunk = cat_file_proc.stdout.read(chunk)
                            # Maybe it already is an LFS object
                            if index == 0 and bytes_chunk.startswith(
                                    b'version https://git-lfs.github.com/spec/v1'):
                                migrate = False
                            checksum.update(bytes_chunk)
                            tmp.write(bytes_chunk)

                        # The payload is followed by a newline that must be
                        # consumed unconditionally; doing this inside an
                        # ``assert`` would skip the read under ``python -O``.
                        terminator = cat_file_proc.stdout.read(1)
                        if terminator != b'\n':
                            raise RuntimeError(
                                'git cat-file payload for {!r} not terminated by a newline'.format(orig_rev))

                        if not migrate:
                            tmp.seek(0)
                            for lfs_line in tmp.readlines():
                                if lfs_line.startswith(b'oid sha256'):
                                    # Strip the leading "oid sha256:" (11 bytes).
                                    sha256_checksum = lfs_line.strip()[11:].decode()
                                    sub_folder = path.join(
                                        lfs_object_path, sha256_checksum[0:2], sha256_checksum[2:4])
                                    os.makedirs(path.join(target_dir, sub_folder), exist_ok=True)
                                    source = path.join(source_dir, sub_folder, sha256_checksum)
                                    shutil.copy(source, path.join(target_dir, sub_folder, sha256_checksum))
                                    print('LFS {}: "{}" preserved'.format(change.blob_id, filename))
                                    break
                            continue

                        # Make sure everything is on disk before copying by name.
                        tmp.flush()
                        sha256_checksum = checksum.hexdigest()
                        content = lfs_object_content.format(sha256_checksum, objsize.decode()).encode()
                        lfs_blob = fr.Blob(content)
                        repo_filter.insert(lfs_blob)
                        migrated_lfs_blobs[change.blob_id] = lfs_blob.id
                        change.blob_id = lfs_blob.id
                        sub_folder = path.join(
                            target_dir, lfs_object_path, sha256_checksum[0:2], sha256_checksum[2:4])
                        os.makedirs(sub_folder, exist_ok=True)
                        shutil.copy(tmp.name, path.join(sub_folder, sha256_checksum))

                    print('LFS {}: "{}" imported ({}, {}, {}, {})'.format(change.blob_id, filename,
                                                                          sha256_checksum, orig_rev, objsize,
                                                                          size))

            # Bound after the callback is defined; the callback only looks the
            # name up once the filter is actually running.
            repo_filter = fr.RepoFilter(fr.FilteringOptions.parse_args(options),
                                        commit_callback=migrate_lfs_commits)
            repo_filter.run()
    finally:
        os.chdir(cwd)
@lru_cache
def compile_expressions(expressions: tuple[str, ...]):
    """Build a matcher for a set of shell-style glob patterns.

    Args:
        expressions: Glob patterns; each is converted with
            ``fnmatch.translate`` and all are combined into one alternation.

    Returns:
        The bound ``match`` method of the compiled regular expression. It
        returns a match object when the given string satisfies any pattern
        and ``None`` otherwise.
    """
    if not expressions:
        # An empty alternation would compile to "()", which matches every
        # input; "(?!)" is a pattern that can never match.
        return re.compile(r'(?!)').match
    regexes = [translate(expression) for expression in expressions]
    return re.compile('({})'.format('|'.join(regexes))).match
@lru_cache
def match_expressions(name: str, expressions: tuple[str, ...]) -> bool:
    """Tell whether a repository path matches any of the glob patterns.

    The path is split on ``/`` and every trailing portion of it is tried in
    turn, from the bare file name up to the full path (for ``a/b/c`` that is
    ``c``, ``b/c`` and ``a/b/c``).

    Args:
        name: Path inside the repository, using ``/`` as separator.
        expressions: Glob patterns; must be hashable because the result is
            cached with ``lru_cache``.

    Returns:
        ``True`` as soon as one trailing portion matches, ``False`` otherwise.
    """
    match = compile_expressions(tuple(expressions))
    # This implementation is not fully compliant with gitattributes but good enough for my use case
    parts = name.split('/')
    # Re-join with "/" rather than the OS separator: the input path and the
    # patterns both use forward slashes regardless of platform.
    return any(
        match('/'.join(parts[start:]))
        for start in range(len(parts) - 1, -1, -1)
    )
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment