Created September 9, 2021 17:15
Git filter repo - remove LFS
What to do:
1. Get list of tracked files `git lfs track` - look for .gitattributes files
2. Prepare replace.txt file based on previous command results
3. Fetch all files from LFS `git lfs fetch --all`
4. Look into `.git/lfs/objects` and copy all files from all subdirectories into lfs-objects directory
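5. Rewrite the .gitattributes files using replace.txt: `git-filter-repo --replace-text replace.txt`
6. Run this script (substitute its actual file name): `./<this-script>.py --relevant 'return filename.endswith(b".JPG") or filename.endswith(b".EXE")' --dir "lfs-objects"`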
Main idea:
- Git LFS files are small files (usually 128 bytes) with pointer to LFS object.
- All git lfs objects are stored in `.git/lfs/objects` directory (and subdirectories)
- Read small files with matching extensions and look for git LFS oid: pattern
- If pattern is found, find the matching file and replace its blob in commit
Possible optimizations:
Automate manual steps:
- update .gitattributes file
- fetch all lfs objects
- automatically copy LFS objects to some working directory, or do lookup directly in .git directory (I wouldn't recommend that, separate directory is better,
in my script I delete object file after replacing - it is easier to check for forgotten obj files and possibly LFS references)
#!/usr/bin/env python3
This is a simple program that replaces Git LFS pointer files throughout
history with the real file contents, taken from a directory of fetched LFS
objects (--dir). Only files accepted by --relevant are examined. It also
rewrites commit hashes in commit messages to update them based on these
changes.
Please see the
  ***** API BACKWARD COMPATIBILITY CAVEAT *****
near the top of git-filter-repo.
# Technically, this program could be replaced by a one-liner:
# git filter-repo --force --commit-callback "if not commit.parents: commit.file_changes.append(FileChange(b'M', $RELATIVE_TO_PROJECT_ROOT_PATHNAME, b'$(git hash-object -w $FILENAME)', b'100644'))"
# but let's do it as a full-fledged program that imports git_filter_repo
# anyway...
import re
import argparse
import os
import subprocess
# git-filter-repo is only importable when it is reachable under the module
# name git_filter_repo; fail with a hint rather than a bare traceback.
try:
    import git_filter_repo as fr
except ImportError:
    raise SystemExit("Error: Couldn't find git_filter_repo.py.  Did you forget to make a symlink to git-filter-repo named git_filter_repo.py or did you forget to put the latter in your PYTHONPATH?")
# Command-line interface.  Arguments are parsed at module level because this
# file is a script, not a library.
parser = argparse.ArgumentParser(
    description='Replace Git LFS pointer files throughout history with the '
                'real file contents')
parser.add_argument('--relevant', metavar="FUNCTION_BODY",
    help=("Python code for determining whether to look for an LFS pointer "
          "in a given filename. It becomes the body of "
          "is_relevant(filename), where filename is a bytestring, e.g. "
          "'return filename.endswith(b\".JPG\")'. Defaults to "
          "'return True'."))
parser.add_argument('--dir', type=os.fsencode,
    help=("Relative-path to the directory holding the fetched LFS objects, "
          "each stored under its sha256 oid as file name"))
lint_args = parser.parse_args()
if not lint_args.dir:
    raise SystemExit("Error: Need to specify the --dir option")
if not os.path.isdir(lint_args.dir):
    raise SystemExit("--dir must be dir")

# Maps the id of an already replaced pointer blob to the id of the blob that
# replaced it, so that each pointer is resolved only once.
blobs_handled = {}
# `git cat-file --batch` child process used to read old blob contents; it is
# started further down, right before the filter runs.
cat_file_process = None
# Matches the object-id line of a Git LFS pointer file ("oid sha256:<hash>").
_LFS_OID_RE = re.compile(rb"oid sha256:([a-z0-9]+)")


def download_file(content):
    """Return a new blob holding the real contents behind an LFS pointer.

    Despite the name nothing is downloaded: the object has to be present
    already in the directory given by ``--dir``, stored under its sha256 oid
    as file name.  The object file is deleted once it has been read, so
    whatever is left in that directory afterwards was never referenced.

    Args:
        content: Raw contents of a candidate pointer file.  Normally bytes;
            any other value is converted with ``str()`` and encoded first.

    Returns:
        The ``fr.Blob`` that was inserted into the filter's output stream,
        or ``None`` when ``content`` contains no ``oid sha256:`` entry.

    Raises:
        OSError: If the object file is missing from ``--dir`` or cannot be
            read or removed.
    """
    # Match on bytes directly; going through str() would match against the
    # repr of the bytes object (b'...') and only work by accident.
    if not isinstance(content, (bytes, bytearray)):
        content = str(content).encode()
    match = _LFS_OID_RE.search(content)
    if match is None:
        return None

    tempFilename = os.path.join(os.path.normpath(lint_args.dir), match.group(1))
    print("Reading " + str(tempFilename))

    # Get the new contents
    with open(tempFilename, "rb") as f:
        blob = fr.Blob(f.read())

    # Insert the new file into the filter's stream, and remove the tempfile
    filter.insert(blob)
    os.remove(tempFilename)
    return blob
# def clean_gitattributes():
def fixup_commits(commit, metadata):
    """Commit callback: swap Git LFS pointer blobs for the real file contents.

    For every non-deleted file change whose name passes ``is_relevant`` the
    old blob is read through the ``git cat-file --batch`` child process.
    Blobs of 120 to 140 bytes are treated as candidate LFS pointers and handed
    to ``download_file``; when that yields a blob, the change is pointed at
    it.  Results are remembered in ``blobs_handled`` so that a pointer seen
    again in a later commit is not resolved twice.

    Args:
        commit: The commit being filtered; its ``file_changes`` are updated
            in place.
        metadata: Supplied by git-filter-repo for commit callbacks; unused.
    """
    for change in commit.file_changes:
        if change.blob_id in blobs_handled:
            change.blob_id = blobs_handled[change.blob_id]
            continue
        if change.type == b'D' or not is_relevant(change.filename):
            continue

        print(b"Checking " + change.filename)
        print(b"Blob id: " + change.blob_id)

        # Get the old blob contents.  The request has to be flushed, or the
        # child never sees it and the readline() below blocks forever.
        cat_file_process.stdin.write(change.blob_id + b'\n')
        cat_file_process.stdin.flush()
        splitLine = cat_file_process.stdout.readline().split()
        print(splitLine)
        if len(splitLine) != 3:
            # cat-file answers "<id> missing" for unknown objects; there is
            # no content to read in that case.
            continue
        objhash, objtype, objsize = splitLine
        print("Size is " + str(objsize))
        size = int(objsize)

        # Always consume the contents plus the trailing newline, even for
        # blobs that are skipped, so the batch stream stays in sync.
        contents_plus_newline = cat_file_process.stdout.read(size + 1)
        if size < 120 or size > 140:
            # Too small or too large to be an LFS pointer file.
            continue

        print(b"Replacing " + change.filename)
        blob = download_file(contents_plus_newline[:-1])
        if blob is None:
            # Small file without an LFS oid: leave it untouched.
            continue

        # Record our handling of the blob and use it for this change
        blobs_handled[change.blob_id] = blob.id
        change.blob_id = blob.id
# print(commit.file_changes)
# if len(commit.parents) == 0:
# commit.file_changes.append(fr.FileChange(b'M', args.file, fhash, fmode))
# FIXME: What if the history already had a file matching the given name,
# but which didn't exist until later in history? Is the intent for the
# user to keep the other version that existed when it existed, or to
# overwrite the version for all of history with the specified file? I
# don't know, but if it's the latter, we'd need to add an 'else' clause
# like the following:
# commit.file_changes = [x for x in commit.file_changes
# if x.filename != args.file]
# File names are always needed here, because --relevant decides by name.
lint_args.filenames_important = True
if not lint_args.relevant:
    lint_args.relevant = 'return True'

# Build is_relevant(filename) from the --relevant function body.
# NOTE: this executes arbitrary Python taken from the command line; only run
# the script with a --relevant value you wrote yourself.
body = lint_args.relevant
exec('def is_relevant(filename):\n '+'\n '.join(body.splitlines()),
     globals())

args = fr.FilteringOptions.default_options()
args.force = True
# Alternative options that could be used instead of the defaults above:
# fr.FilteringOptions.parse_args(['--preserve-commit-encoding',
#                                 '--force',
#                                 '--replace-refs', 'update-no-add'])

# Long-running helper through which fixup_commits reads old blob contents.
cat_file_process = subprocess.Popen(['git', 'cat-file', '--batch'],
                                    stdin = subprocess.PIPE,
                                    stdout = subprocess.PIPE)
filter = fr.RepoFilter(args, commit_callback=fixup_commits)
try:
    filter.run()
finally:
    # Closing stdin lets `git cat-file --batch` exit; then reap the child.
    cat_file_process.stdin.close()
    cat_file_process.wait()
*.png filter=lfs diff=lfs merge=lfs -text==>*.png -text
These 2 might be better:
