Create a gist now

Instantly share code, notes, and snippets.

Remove large objects from a git repository
#!/usr/bin/python
#
# git-slim
#
# Remove big files from git repo history.
#
# Requires GitPython (https://github.com/gitpython-developers/GitPython)
#
# References:
# - http://help.github.com/remove-sensitive-data/
# - http://stackoverflow.com/questions/4444091/git-filter-branch-to-delete-large-file
# - http://stackoverflow.com/questions/1029969/why-is-my-git-repository-so-big/1036595#1036595
# - http://stackoverflow.com/questions/460331/git-finding-a-filename-from-a-sha1
from glob import glob
from git import Repo
from os.path import getsize
from re import split
from shutil import rmtree
from sys import argv, exit, stdout
def print_activity(start, end='done'):
    '''Decorator which logs info like "Doing something: done" to stdout.

    start -- label written (followed by ': ') before the wrapped call runs.
    end -- text printed on the same line once the wrapped call has returned.

    The wrapped function's return value is passed through unchanged. If the
    wrapped function raises, the exception propagates and `end` is not
    printed.
    '''
    # Imported locally so this block does not depend on the module's import
    # list being edited.
    from functools import wraps

    def decorate(f):
        @wraps(f)  # keep f's __name__ and __doc__ on the wrapper
        def wrapped(*args, **kwargs):
            stdout.write('%s: ' % start)
            # Flush so the label is visible while a slow call is running.
            stdout.flush()
            x = f(*args, **kwargs)
            print(end)
            return x
        return wrapped
    return decorate
def slim_main():
    '''Command-line entry point.

    Slims the repo at the path given as the first command-line argument, or
    the current directory when no argument is supplied. A KeyboardInterrupt
    raised while slimming ends the process with exit status 0.
    '''
    if len(argv) > 1:
        target = argv[1]
    else:
        target = '.'
    try:
        slim(target)
    except KeyboardInterrupt:
        exit(0)
def slim(repo_dir):
    '''Run the whole slimming sequence against the repo at repo_dir.

    Steps, in order: open the repo, prep it, measure its size, interactively
    remove blobs, tidy up, measure again and report both sizes.
    '''
    repo = Repo(repo_dir)
    prep(repo)
    size_before = repo_size(repo)
    slim_blobs(repo)
    tidy_up(repo)
    size_after = repo_size(repo)
    ok_done(size_before, size_after)
def repo_size(r):
    '''Return the total size in bytes of every file under r.git_dir.

    os.path.getsize() on a directory only reports the size of the directory
    entry itself, not of its contents, so the tree is walked and the sizes of
    the individual files are summed instead.
    '''
    # Imported locally so this block does not depend on the module's import
    # list being edited.
    import os

    total = 0
    for dirpath, _dirnames, filenames in os.walk(r.git_dir):
        for name in filenames:
            try:
                total += os.path.getsize(os.path.join(dirpath, name))
            except OSError:
                # Entry vanished or is a dangling symlink; it occupies no
                # measurable space, so leave it out of the total.
                continue
    return total
def prep(r):
    '''Get a repo ready for slimming: garbage collect, then repack.

    Raises Exception('repo is dirty') before doing anything when
    r.is_dirty() reports true.
    '''
    dirty = r.is_dirty()
    if dirty:
        raise Exception('repo is dirty')
    for step in (gc, repack):
        step(r)
def slim_blobs(r):
    '''Reduce repo size by listing blobs in size order and asking the user if
    they would like to remove them.

    Each distinct path is offered once. Answering 'y' (or just pressing
    return) marks the path for removal, 'd' stops asking, and anything else
    keeps the path. All marked paths are then passed to remove_files().
    '''
    pack_blobs = list_pack_blobs_by_size(r)
    index = blob_index(r)
    # Paths already offered to the user. A set keeps the membership test
    # constant-time however many blobs the pack holds.
    seen = set()
    targets = []
    for hexsha, size in pack_blobs:
        if hexsha not in index:
            print('%s not in blob index' % hexsha)
            continue
        blob_path, commit_hexsha = index[hexsha]
        if blob_path in seen:
            continue
        blob_size = format_size(size)
        commit_hexsha_prefix = commit_hexsha[:7]
        prompt = 'Remove %s (%s at %s)? [Y/n/d] ' % \
            (blob_path, blob_size, commit_hexsha_prefix)
        answer = raw_input(prompt).strip().lower()
        if answer == 'd':
            break
        if answer in ('y', ''):
            targets.append(blob_path)
        seen.add(blob_path)
    remove_files(r, targets)
def blob_index(r):
    '''Build index of paths of blobs in the repo. Iterates across all files in
    all commits returned by r.iter_commits() and records the blob used.

    Returns a dict mapping blob hexsha -> (blob path, str(commit)). When the
    same blob appears in several commits, the entry from the commit iterated
    last wins. A "(i/total)" progress counter is rewritten in place on stdout
    while indexing.
    '''
    desc = 'Indexing blobs in commits: '
    index = {}
    commits = list(r.iter_commits())
    commits_len = len(commits)

    def blob_predicate(item, depth):
        # traverse() passes (item, depth); only blobs are of interest.
        return item.type == 'blob'

    for i, commit in enumerate(commits, 1):
        stdout.write('\r%s(%s/%s)' % (desc, i, commits_len))
        stdout.flush()
        for blob in commit.tree.traverse(predicate=blob_predicate):
            index[blob.hexsha] = blob.path, str(commit)
    # Pad the final text to the width of the widest progress counter so no
    # stray characters from "(i/total)" are left behind on the line.
    progress_width = len('(%s/%s)' % (commits_len, commits_len))
    print('\r%s%s' % (desc, 'done '.ljust(progress_width)))
    return index
@print_activity('Listing pack blobs')
def list_pack_blobs_by_size(r):
    '''Return the (sha, size) pairs from list_pack_blobs() as a list ordered
    from largest size to smallest.'''
    def size_of(entry):
        return entry[1]

    ordered = list(list_pack_blobs(r))
    ordered.sort(key=size_of, reverse=True)
    return ordered
def list_pack_blobs(r):
    '''Run git verify-pack in verbose mode over every pack index found under
    the repo's objects/pack directory and return the blob entries parsed
    from its output by extract_blob_info().'''
    idx_files = glob(r.git_dir + '/objects/pack/pack-*.idx')
    verbose_listing = r.git.verify_pack(*idx_files, verbose=True)
    return extract_blob_info(verbose_listing)
def extract_blob_info(pack_info):
    '''Extract info about blobs in a pack from text returned by git
    verify-pack.

    pack_info -- the command output as a single newline-separated string.

    Yields (object name, size) tuples, one per line whose second
    whitespace-separated field is 'blob'. The size is the fourth field
    converted to int (presumably the size-in-packfile column of the verbose
    listing -- confirm against the git verify-pack documentation).
    '''
    for line in pack_info.split('\n'):
        bits = split(r'\s+', line)
        # bits[3] is read below, so require at least four fields; shorter
        # lines (summaries, blanks) are skipped instead of raising IndexError.
        if len(bits) > 3 and bits[1] == 'blob':
            yield bits[0], int(bits[3])
def format_size(num):
    '''Format numbers as file sizes. From hurry.filesize.

    num -- a size in bytes; may be negative (e.g. a size difference).

    Returns a string such as '512 bytes', '3KB' or '-2MB', rounded to a
    whole number of the chosen unit. Values of 1024 TB or more are reported
    in PB rather than falling off the end of the unit list.
    '''
    for x in [' bytes', 'KB', 'MB', 'GB', 'TB']:
        # Compare magnitudes so negative values get a sensible unit too.
        if abs(num) < 1024.0:
            return "%.0f%s" % (num, x)
        num /= 1024.0
    return "%.0f%s" % (num, 'PB')
@print_activity('Removing files from repo history')
def remove_files(r, fs):
    '''Run git rm for each file in list against each commit using git
    filter-branch. Completely removes files from repo history.

    r -- the repo whose history is rewritten.
    fs -- list of paths to remove; nothing is run when it is empty.

    Only the history reachable from HEAD is rewritten.
    '''
    if not fs:
        return
    # shlex.quote on Python 3, pipes.quote on Python 2.
    try:
        from shlex import quote
    except ImportError:
        from pipes import quote
    # The index filter is evaluated by a shell, so every path is quoted to
    # survive spaces and metacharacters, and '--' stops a path that starts
    # with '-' from being read as an option.
    # todo: check file list doesn't exceed max command length
    filelist = ' '.join(quote(f) for f in fs)
    r.git.filter_branch('--index-filter',
                        'git rm --cached --ignore-unmatch -- %s' % filelist,
                        '--prune-empty',
                        'HEAD')
def tidy_up(r):
    '''Clean up after slimming so the space held by removed objects can be
    recovered: drop the original refs, expire the reflog, garbage collect
    and finally repack, in that order.
    '''
    for step in (rm_original_refs, expire_reflog, gc, repack):
        step(r)
@print_activity('Removing original refs')
def rm_original_refs(r):
    '''Delete the refs/original/ directory inside the repo's git dir,
    ignoring any error (including the directory not existing).'''
    backup_refs = r.git_dir + '/refs/original/'
    rmtree(backup_refs, ignore_errors=True)
@print_activity('Expiring reflog')
def expire_reflog(r):
    '''Call git reflog with the arguments expire --expire=now --all.'''
    reflog_args = ('expire', '--expire=now', '--all')
    r.git.reflog(*reflog_args)
@print_activity('Garbage collecting')
def gc(r):
    '''Call git gc on the repo with the prune option switched on.'''
    options = {'prune': True}
    r.git.gc(**options)
@print_activity('Repacking')
def repack(r):
    '''Call git repack on the repo with the a, d and q options switched on.'''
    options = dict(a=True, d=True, q=True)
    r.git.repack(**options)
def ok_done(old_size, new_size):
    '''Print a summary of the space saved and the follow-up steps.

    old_size -- repo size in bytes before slimming.
    new_size -- repo size in bytes after slimming.
    '''
    delta = format_size(old_size - new_size)
    old_f = format_size(old_size)
    new_f = format_size(new_size)
    print('\nRepo slimmed by %s (reduced from %s to %s)' % (delta, old_f, new_f))
    # The option is spelled --aggressive; the misspelled form is rejected by
    # git, so the hint must show the spelling that actually works.
    print('(Running \'git gc --aggressive --prune\' may reclaim further space)\n')
    print('Next run \'git push origin --all --force\'')
    print('Then re-clone all copies of the repo')
    print('Warning: If an old clone is used, big objects may reappear')
# Only start slimming when executed as a script, not when imported.
if '__main__' == __name__:
    slim_main()
@arubin
arubin commented Feb 14, 2012

This works amazingly awesome! I had to run git gc --aggressive --prune to get it to take effect, but after that it was magic!

Mega thanks!

@afternoon
Owner
@dustinschultz

Thanks for the little script. It made this time-consuming process somewhat easier. Thanks again.

Dustin

@bimargulies

This exploded for me:

Garbage collecting: done
Repacking: done
Listing pack blobs: done
Traceback (most recent call last):
  File "/Users/benson/bin/git-slim.py", line 208, in <module>
    slim_main()
  File "/Users/benson/bin/git-slim.py", line 40, in slim_main
    slim(repo_dir)
  File "/Users/benson/bin/git-slim.py", line 49, in slim
    slim_blobs(r)
  File "/Users/benson/bin/git-slim.py", line 73, in slim_blobs
    index = blob_index(r)
  File "/Users/benson/bin/git-slim.py", line 102, in blob_index
    commits = list(r.iter_commits())
  File "/Library/Python/2.7/site-packages/git/repo/base.py", line 423, in iter_commits
    rev = self.head.commit
  File "/Library/Python/2.7/site-packages/git/refs/symbolic.py", line 168, in _get_commit
    obj = self._get_object()
  File "/Library/Python/2.7/site-packages/git/refs/symbolic.py", line 161, in _get_object
    return Object.new_from_sha(self.repo, hex_to_bin(self.dereference_recursive(self.repo, self.path)))
  File "/Library/Python/2.7/site-packages/git/objects/base.py", line 64, in new_from_sha
    oinfo = repo.odb.info(sha1)
  File "/Library/Python/2.7/site-packages/gitdb/db/base.py", line 256, in info
    return self._db_query(sha).info(sha)
  File "/Library/Python/2.7/site-packages/gitdb/db/loose.py", line 162, in info
    m = self._map_loose_object(sha)
  File "/Library/Python/2.7/site-packages/gitdb/db/loose.py", line 146, in _map_loose_object
    raise BadObject(sha)
gitdb.exc.BadObject: BadObject: 59a968e4b7bf20039a9314c383a7bb5aa955b53c
@Chronial
Chronial commented Jul 2, 2013

Quick look at the code: Won’t it only run git filter-branch on the current branch instead of the whole repo?

And since this is such a potentially destructive script, you might want to add a notice about that.

@kubark42
kubark42 commented Jun 9, 2014

@bimargulies, this script failed for me as well, with the identical error. For the time being this script probably shouldn't be used, especially since as @Chronial points out it's potentially destructive.

python git-slim.py 
Garbage collecting: done
Repacking: done
Listing pack blobs: done
Traceback (most recent call last):
  File "git-slim.py", line 208, in <module>
    slim_main()
  File "git-slim.py", line 40, in slim_main
    slim(repo_dir)
  File "git-slim.py", line 49, in slim
    slim_blobs(r)
  File "git-slim.py", line 73, in slim_blobs
    index = blob_index(r)
  File "git-slim.py", line 102, in blob_index
    commits = list(r.iter_commits())
  File "/Library/Python/2.7/site-packages/GitPython-0.3.2.RC1-py2.7.egg/git/repo/base.py", line 424, in iter_commits
    rev = self.head.commit
  File "/Library/Python/2.7/site-packages/GitPython-0.3.2.RC1-py2.7.egg/git/refs/symbolic.py", line 168, in _get_commit
    obj = self._get_object()
  File "/Library/Python/2.7/site-packages/GitPython-0.3.2.RC1-py2.7.egg/git/refs/symbolic.py", line 161, in _get_object
    return Object.new_from_sha(self.repo, hex_to_bin(self.dereference_recursive(self.repo, self.path)))
  File "/Library/Python/2.7/site-packages/GitPython-0.3.2.RC1-py2.7.egg/git/objects/base.py", line 64, in new_from_sha
    oinfo = repo.odb.info(sha1)
  File "/Library/Python/2.7/site-packages/gitdb-0.5.4-py2.7-macosx-10.9-intel.egg/gitdb/db/base.py", line 256, in info
    return self._db_query(sha).info(sha)
  File "/Library/Python/2.7/site-packages/gitdb-0.5.4-py2.7-macosx-10.9-intel.egg/gitdb/db/loose.py", line 162, in info
    m = self._map_loose_object(sha)
  File "/Library/Python/2.7/site-packages/gitdb-0.5.4-py2.7-macosx-10.9-intel.egg/gitdb/db/loose.py", line 146, in _map_loose_object
    raise BadObject(sha)
gitdb.exc.BadObject: BadObject: 45bb1fbbb91af857c8566fd30fe59d6dfee0d63d
@gclsoft
gclsoft commented Jul 1, 2014

Failed on OS X 10.10

python git-slim.py 
Traceback (most recent call last):
  File "git-slim.py", line 208, in <module>
    slim_main()
  File "git-slim.py", line 40, in slim_main
    slim(repo_dir)
  File "git-slim.py", line 47, in slim
    prep(r)
  File "git-slim.py", line 61, in prep
    if r.is_dirty():
TypeError: 'bool' object is not callable
@greenspray

I get this error:

File "git-slim.py", line 16, in
from git import Repo
ImportError: No module named git

This means the library is not found. Where can I get the library for this?

@SimonZhang2012

@greenspray, you can google how to install gitpython.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment