Created
September 9, 2021 17:15
-
-
Save klinki/3a314ab3e7ab680d16b5e7eb256cafbd to your computer and use it in GitHub Desktop.
Git filter repo - remove LFS
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
What to do: | |
1. Get list of tracked files `git lfs track` - look for .gitattributes files | |
2. Prepare replace.txt file based on previous command results | |
3. Fetch all files from LFS `git lfs fetch --all` | |
4. Look into `.git/lfs/objects` and copy all files from all subdirectories into lfs-objects directory | |
git-filter-repo --replace-text replace.txt | |
./remove-lfs.py --relevant 'return filename.endswith(b".JPG") or filename.endswith(b".EXE")' --dir "lfs-objects" | |
Main idea: | |
- Git LFS pointer files are small files (usually 128 bytes) that contain a pointer to the LFS object. | |
- All git lfs objects are stored in `.git/lfs/objects` directory (and subdirectories) | |
- Read small files with matching extensions and look for git LFS oid: pattern | |
- If pattern is found, find the matching file and replace its blob in commit | |
Possible optimizations: | |
Automate manual steps: | |
- update .gitattributes file | |
- fetch all lfs objects | |
- automatically copy LFS objects to some working directory, or do the lookup directly in the .git directory (I wouldn't recommend that; a separate directory is better. | |
In my script I delete each object file after replacing it, which makes it easier to check for forgotten object files and possibly remaining LFS references) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
""" | |
This is a simple program that replaces Git LFS pointer files in history | |
with the real file contents. For every path accepted by --relevant it reads | |
the small pointer blob, looks up the object named by its `oid sha256:` line | |
in the --dir directory, and substitutes that object's contents as the blob. | |
""" | |
""" | |
Please see the | |
***** API BACKWARD COMPATIBILITY CAVEAT ***** | |
near the top of git-filter-repo. | |
""" | |
# Technically, this program could be replaced by a one-liner: | |
# git filter-repo --force --commit-callback "if not commit.parents: commit.file_changes.append(FileChange(b'M', $RELATIVE_TO_PROJECT_ROOT_PATHNAME, b'$(git hash-object -w $FILENAME)', b'100644'))" | |
# but let's do it as a full-fledged program that imports git_filter_repo | |
# anyway... | |
import re | |
import argparse | |
import os | |
import subprocess | |
try: | |
import git_filter_repo as fr | |
except ImportError: | |
raise SystemExit("Error: Couldn't find git_filter_repo.py. Did you forget to make a symlink to git-filter-repo named git_filter_repo.py or did you forget to put the latter in your PYTHONPATH?") | |
# Command-line interface. --relevant selects which paths are inspected for
# LFS pointers; --dir names the directory holding the fetched LFS objects.
parser = argparse.ArgumentParser(
    description='Replace Git LFS pointer files in history with the real '
                'file contents taken from a directory of LFS objects')
parser.add_argument('--relevant', metavar="FUNCTION_BODY",
    help=("Python code used as the body of is_relevant(filename), which "
          "decides whether a given filename (a bytes object) is checked "
          "for an LFS pointer. Defaults to 'return True'."))
parser.add_argument('--dir', type=os.fsencode,
    help=("Path to a directory containing the LFS object files, each named "
          "after its sha256 oid"))
lint_args = parser.parse_args()

# The option is called --dir; the previous message asked for a non-existent
# --file option.
if not lint_args.dir:
    raise SystemExit("Error: Need to specify the --dir option")
if not os.path.isdir(lint_args.dir):
    raise SystemExit("--dir must be dir")

# Maps the id of every pointer blob already replaced to the id of the blob
# that replaced it, so the same pointer seen again is swapped without
# touching the object directory a second time.
blobs_handled = {}
# Long-running `git cat-file --batch` process used to read old blob
# contents; it is started right before the filter runs.
cat_file_process = None
def download_file(content):
    """Turn the contents of a Git LFS pointer into a blob with the real data.

    Searches *content* for an ``oid sha256:<hex>`` entry, reads the file with
    that name from the ``--dir`` directory, inserts its contents into the
    running filter's stream as a new blob, and then deletes the object file.

    Args:
        content: Raw blob contents (bytes; str is also accepted) that may be
            an LFS pointer.

    Returns:
        The newly inserted ``fr.Blob``, or ``None`` when *content* contains
        no LFS oid.

    Raises:
        OSError: If the object file cannot be opened, read, or removed (for
            example because it is not present in the directory).
    """
    if isinstance(content, str):
        content = content.encode()

    # Match on the raw bytes rather than on str(bytes), which is the repr of
    # the bytes object. An LFS sha256 oid is exactly 64 lowercase hex digits.
    match = re.search(rb"oid sha256:([0-9a-f]{64})\b", content)
    if match is None:
        return None

    oid = match.group(1)
    object_path = os.path.join(os.path.normpath(lint_args.dir), oid)
    print("Reading " + os.fsdecode(object_path))

    # Get the new contents
    with open(object_path, "rb") as f:
        blob = fr.Blob(f.read())

    # Insert the new blob into the filter's stream (`filter` is the
    # module-level RepoFilter), then remove the object file so leftovers in
    # the directory reveal objects that were never referenced.
    filter.insert(blob)
    os.remove(object_path)
    return blob
# def clean_gitattributes(): | |
def fixup_commits(commit, metadata):
    """Commit callback: swap Git LFS pointer blobs for the real file contents.

    For every non-delete file change whose filename passes ``is_relevant``,
    the old blob is read through the ``git cat-file --batch`` process. Blobs
    of 120-140 bytes are handed to ``download_file``; when that yields a
    replacement blob, the change is pointed at it and the mapping is recorded
    in ``blobs_handled`` so later occurrences are rewritten directly.

    Args:
        commit: Commit being filtered; its ``file_changes`` entries are
            updated in place.
        metadata: Unused.
    """
    for change in commit.file_changes:
        if change.blob_id in blobs_handled:
            change.blob_id = blobs_handled[change.blob_id]
            continue
        if change.type == b'D' or not is_relevant(change.filename):
            continue

        print()
        print(b"Checking " + change.filename)
        print(b"Blob id: " + change.blob_id)

        # Get the old blob contents
        cat_file_process.stdin.write(change.blob_id + b'\n')
        cat_file_process.stdin.flush()
        header = cat_file_process.stdout.readline().split()
        print(header)
        if len(header) != 3:
            # cat-file answers "<id> missing" (two fields, no payload) for
            # ids it cannot resolve -- presumably e.g. submodule entries.
            # Nothing to read or replace in that case.
            continue
        objhash, objtype, objsize = header
        size = int(objsize)
        print("Size is " + str(size))

        # The payload (plus its trailing newline) must always be consumed,
        # even for blobs we skip, to keep the batch stream in sync.
        contents_plus_newline = cat_file_process.stdout.read(size + 1)
        if not 120 <= size <= 140:
            continue

        blob = download_file(contents_plus_newline)
        if blob is None:
            # Small relevant file that is not an LFS pointer: keep it as is.
            continue

        print(b"Replacing " + change.filename)
        # Record our handling of the blob and use it for this change
        blobs_handled[change.blob_id] = blob.id
        change.blob_id = blob.id
# print(commit.file_changes) | |
# if len(commit.parents) == 0: | |
# commit.file_changes.append(fr.FileChange(b'M', args.file, fhash, fmode)) | |
# FIXME: What if the history already had a file matching the given name, | |
# but which didn't exist until later in history? Is the intent for the | |
# user to keep the other version that existed when it existed, or to | |
# overwrite the version for all of history with the specified file? I | |
# don't know, but if it's the latter, we'd need to add an 'else' clause | |
# like the following: | |
#else: | |
# commit.file_changes = [x for x in commit.file_changes | |
# if x.filename != args.file] | |
# Replacement is always path-based, so the commit-callback route is the only
# one used. The former blob-callback branch was unreachable and referred to
# an undefined function.
lint_args.filenames_important = True
if not lint_args.relevant:
    lint_args.relevant = 'return True'

# SECURITY NOTE: the --relevant text is executed verbatim as Python code.
# Only pass function bodies you wrote or trust.
body = lint_args.relevant
exec('def is_relevant(filename):\n '+'\n '.join(body.splitlines()),
     globals())

args = fr.FilteringOptions.default_options()
args.force = True

# One long-lived cat-file process answers all blob lookups made by
# fixup_commits.
cat_file_process = subprocess.Popen(['git', 'cat-file', '--batch'],
                                    stdin=subprocess.PIPE,
                                    stdout=subprocess.PIPE)
try:
    # The name `filter` is read as a global by download_file.
    filter = fr.RepoFilter(args, commit_callback=fixup_commits)
    filter.run()
finally:
    # Shut the helper process down even when filtering fails part-way.
    cat_file_process.stdin.close()
    cat_file_process.wait()
# fr_args = fr.FilteringOptions.parse_args(['--preserve-commit-encoding', | |
# '--force', | |
# '--replace-refs', 'update-no-add']) | |
# filter = fr.RepoFilter(fr_args, commit_callback=fixup_commits) | |
# filter.run() | |
# |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
*.png filter=lfs diff=lfs merge=lfs -text==>*.png -text | |
These 2 might be better: | |
filter=lfs==> | |
diff=lfs==> |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment