Skip to content

Embed URL

HTTPS clone URL

Subversion checkout URL

You can clone with HTTPS or Subversion.
Download ZIP
Remove large objects from a git repository
#!/usr/bin/python
#
# git-slim
#
# Remove big files from git repo history.
#
# Requires GitPython (https://github.com/gitpython-developers/GitPython)
#
# References:
# - http://help.github.com/remove-sensitive-data/
# - http://stackoverflow.com/questions/4444091/git-filter-branch-to-delete-large-file
# - http://stackoverflow.com/questions/1029969/why-is-my-git-repository-so-big/1036595#1036595
# - http://stackoverflow.com/questions/460331/git-finding-a-filename-from-a-sha1
from glob import glob
from git import Repo
from os.path import getsize
from re import split
from shutil import rmtree
from sys import argv, exit, stdout
def print_activity(start, end='done'):
    '''Decorator factory which logs info like "Doing something: done" to stdout.

    start -- text written, followed by ': ', before the wrapped call runs.
    end   -- text printed once the wrapped call has returned (default 'done').

    The wrapped function's return value is passed through unchanged. If the
    wrapped function raises, the exception propagates and `end` is not
    printed.
    '''
    # Imported locally so the block does not rely on a file-level import.
    from functools import wraps

    def decorate(f):
        # wraps() copies f's __name__ and __doc__ onto the wrapper so decorated
        # functions stay recognisable in tracebacks and help().
        @wraps(f)
        def wrapped(*args, **kwargs):
            stdout.write('%s: ' % start)
            # Flush so the label is visible while the call is still running.
            stdout.flush()
            x = f(*args, **kwargs)
            print(end)
            return x
        return wrapped
    return decorate
def slim_main():
    '''Command-line entry point.

    Slims the repo named by the first command-line argument, or the current
    directory when no argument is given. A keyboard interrupt ends the run
    with exit status 0.
    '''
    if len(argv) > 1:
        target = argv[1]
    else:
        target = '.'
    try:
        slim(target)
    except KeyboardInterrupt:
        exit(0)
def slim(repo_dir):
    '''Run the whole slimming workflow on the repo at repo_dir.

    Steps, in order: open the repo, prep it, record its size, interactively
    remove blobs, tidy up, then report the size before and after.
    '''
    repo = Repo(repo_dir)
    prep(repo)
    size_before = repo_size(repo)
    slim_blobs(repo)
    tidy_up(repo)
    ok_done(size_before, repo_size(repo))
def repo_size(r):
    '''Return the total size in bytes of the files under the repo's git dir.

    r -- object exposing a `git_dir` attribute (path of the git directory).

    getsize() on a directory only reports the size of the directory entry,
    not of its contents, so the tree is walked and the file sizes are summed.
    Symbolic links are skipped rather than followed.
    '''
    # Imported locally so the block does not rely on new file-level imports.
    from os import walk
    from os.path import islink, join

    total = 0
    for dirpath, _dirnames, filenames in walk(r.git_dir):
        for name in filenames:
            path = join(dirpath, name)
            if not islink(path):
                total += getsize(path)
    return total
def prep(r):
    '''Prep a repo by running GC and repacking.

    r -- the GitPython Repo to prepare.

    Raises Exception('repo is dirty') when the repo reports itself as dirty;
    nothing is garbage collected or repacked in that case.
    '''
    # Some GitPython versions expose is_dirty as a plain bool attribute and
    # others as a method; calling a bool fails with "'bool' object is not
    # callable", so accept both forms.
    dirty = r.is_dirty
    if callable(dirty):
        dirty = dirty()
    if dirty:
        raise Exception('repo is dirty')
    gc(r)
    repack(r)
def slim_blobs(r):
    '''Reduce repo size by listing blobs in size order and asking the user if
    they would like to remove them.

    Each distinct path is offered once, largest pack blob first. Answers:
    'y' or an empty line marks the path for removal, 'd' stops asking, and
    anything else keeps the path. All marked paths are then handed to
    remove_files() in one go.
    '''
    # raw_input only exists on Python 2; fall back to input on Python 3.
    try:
        ask = raw_input
    except NameError:
        ask = input
    pack_blobs = list_pack_blobs_by_size(r)
    index = blob_index(r)
    seen = set()  # paths already offered; a set keeps membership tests O(1)
    targets = []
    for blob_hexsha, blob_bytes in pack_blobs:
        if blob_hexsha not in index:
            print('%s not in blob index' % blob_hexsha)
            continue
        blob_path, commit_hexsha = index[blob_hexsha]
        if blob_path in seen:
            # Several blobs can share a path; only ask about each path once.
            continue
        prompt = 'Remove %s (%s at %s)? [Y/n/d] ' % \
            (blob_path, format_size(blob_bytes), commit_hexsha[:7])
        answer = ask(prompt).strip().lower()
        if answer == 'd':
            break
        elif answer in ('y', ''):
            targets.append(blob_path)
        seen.add(blob_path)
    remove_files(r, targets)
def blob_index(r):
    '''Map each blob found in the repo's commits to a path and a commit.

    Walks the tree of every commit yielded by r.iter_commits() and returns a
    dict of blob hexsha -> (blob path, str(commit)). When a blob shows up in
    several commits, the entry from the commit iterated last wins. A progress
    counter is written to stdout while indexing.
    '''
    label = 'Indexing blobs in commits: '
    history = list(r.iter_commits())
    total = len(history)

    def is_blob(item, depth):
        # traverse() passes (item, depth); only the item's type matters here.
        return item.type == 'blob'

    found = {}
    for position, commit in enumerate(history, 1):
        # '\r' rewinds to the start of the line so the counter updates in place.
        stdout.write('\r%s(%s/%s)' % (label, position, total))
        stdout.flush()
        for blob in commit.tree.traverse(predicate=is_blob):
            found[blob.hexsha] = blob.path, str(commit)
    print('\r%sdone ' % label)
    return found
@print_activity('Listing pack blobs')
def list_pack_blobs_by_size(r):
    '''Return the repo's pack blobs as a list of (hexsha, size) pairs,
    ordered from largest to smallest size.
    '''
    by_size = list(list_pack_blobs(r))
    by_size.sort(key=lambda entry: entry[1], reverse=True)
    return by_size
def list_pack_blobs(r):
    '''Call git verify-pack to dump info about blobs in a pack.

    r -- the GitPython Repo whose pack indexes are inspected.

    Returns an iterable of (hexsha, size) pairs as produced by
    extract_blob_info(); the iterable is empty when the repo has no pack
    index files.
    '''
    pack_index_files = glob(r.git_dir + '/objects/pack/pack-*.idx')
    if not pack_index_files:
        # git verify-pack expects at least one .idx argument, so there is
        # nothing to run (and nothing to report) without any packs.
        return iter(())
    pack_info = r.git.verify_pack(*pack_index_files, verbose=True)
    return extract_blob_info(pack_info)
def extract_blob_info(pack_info):
    '''Yield (hexsha, size) for every blob line in git verify-pack output.

    pack_info -- the verbose verify-pack text, one object per line.

    A line counts as a blob line when its second whitespace-separated field
    is 'blob'. The size is taken from the fourth field as an int; presumably
    that is the object's size inside the pack file -- confirm against the
    git verify-pack output format.
    '''
    for row in pack_info.split('\n'):
        fields = split(r'\s+', row)
        if len(fields) < 2 or fields[1] != 'blob':
            continue
        hexsha, size = fields[0], int(fields[3])
        yield hexsha, size
def format_size(num):
    '''Format numbers as file sizes. From hurry.filesize.

    num -- a size in bytes; may be negative (e.g. a size difference).

    Returns a string such as '512 bytes', '3KB' or '-2MB', rounded to a whole
    number of the largest unit that keeps the magnitude below 1024. Values of
    1024 TB and above are still expressed in TB rather than returning None.
    '''
    units = (' bytes', 'KB', 'MB', 'GB', 'TB')
    size = float(num)
    for unit in units[:-1]:
        # Compare the magnitude so negative sizes are scaled like positive ones.
        if abs(size) < 1024.0:
            return "%.0f%s" % (size, unit)
        size /= 1024.0
    # Anything left over is reported in the largest unit available.
    return "%.0f%s" % (size, units[-1])
@print_activity('Removing files from repo history')
def remove_files(r, fs):
    '''Run git rm for each file in list against each commit using git
    filter-branch. Completely removes files from repo history.

    r  -- the GitPython Repo to rewrite.
    fs -- list of repo-relative paths to remove; nothing is run when empty.

    NOTE(review): only history reachable from HEAD is rewritten, because
    'HEAD' is the sole revision passed to filter-branch.
    '''
    if not fs:
        return
    # shlex.quote is the Python 3 name; pipes.quote is the Python 2 one.
    try:
        from shlex import quote
    except ImportError:
        from pipes import quote
    # todo: check file list doesn't exceed max command length
    # The index filter is run through a shell, so every path is quoted to
    # survive spaces and shell metacharacters.
    filelist = ' '.join(quote(f) for f in fs)
    # '--' stops a path that starts with '-' being read as an option.
    r.git.filter_branch('--index-filter',
                        'git rm --cached --ignore-unmatch -- %s' % filelist,
                        '--prune-empty',
                        'HEAD')
def tidy_up(r):
    '''Clean up after slimming so removed objects can actually be dropped.

    Runs, in order: removal of the original refs, reflog expiry, garbage
    collection and a repack.
    '''
    for step in (rm_original_refs, expire_reflog, gc, repack):
        step(r)
@print_activity('Removing original refs')
def rm_original_refs(r):
    '''Delete the refs/original/ directory inside the git dir.

    Errors (such as the directory not existing) are ignored.
    '''
    backup_refs = r.git_dir + '/refs/original/'
    rmtree(backup_refs, ignore_errors=True)
@print_activity('Expiring reflog')
def expire_reflog(r):
    '''Run `git reflog expire --expire=now --all` on the repo.'''
    reflog_args = ('expire', '--expire=now', '--all')
    r.git.reflog(*reflog_args)
@print_activity('Garbage collecting')
def gc(r):
    '''Run git gc on the repo with the prune option switched on.'''
    options = {'prune': True}
    r.git.gc(**options)
@print_activity('Repacking')
def repack(r):
    '''Run git repack on the repo with the a, d and q options switched on.'''
    git_cmd = r.git
    git_cmd.repack(a=True, d=True, q=True)
def ok_done(old_size, new_size):
    '''Print a summary of the space saved and the recommended next steps.

    old_size -- repo size in bytes before slimming.
    new_size -- repo size in bytes after slimming.
    '''
    delta = format_size(old_size - new_size)
    old_f = format_size(old_size)
    new_f = format_size(new_size)
    print('\nRepo slimmed by %s (reduced from %s to %s)' % (delta, old_f, new_f))
    # The git option is spelled --aggressive; the previous '--agressive' would
    # be rejected by git if pasted as shown.
    print('(Running \'git gc --aggressive --prune\' may reclaim further space)\n')
    print('Next run \'git push origin --all --force\'')
    print('Then re-clone all copies of the repo')
    print('Warning: If an old clone is used, big objects may reappear')
# Run the slimming workflow only when executed as a script, not on import.
if __name__ == '__main__':
    slim_main()
@arubin

This works amazingly awesome! I had to run git gc --aggressive --prune to get it to take effect, but after that it was magic!

Mega thanks!

@afternoon
Owner
@dustinschultz

Thanks for the little script. It made this time-consuming process somewhat easier. Thanks again.

Dustin

@bimargulies

This exploded for me:

Garbage collecting: done
Repacking: done
Listing pack blobs: done
Traceback (most recent call last):
  File "/Users/benson/bin/git-slim.py", line 208, in <module>
    slim_main()
  File "/Users/benson/bin/git-slim.py", line 40, in slim_main
    slim(repo_dir)
  File "/Users/benson/bin/git-slim.py", line 49, in slim
    slim_blobs(r)
  File "/Users/benson/bin/git-slim.py", line 73, in slim_blobs
    index = blob_index(r)
  File "/Users/benson/bin/git-slim.py", line 102, in blob_index
    commits = list(r.iter_commits())
  File "/Library/Python/2.7/site-packages/git/repo/base.py", line 423, in iter_commits
    rev = self.head.commit
  File "/Library/Python/2.7/site-packages/git/refs/symbolic.py", line 168, in _get_commit
    obj = self._get_object()
  File "/Library/Python/2.7/site-packages/git/refs/symbolic.py", line 161, in _get_object
    return Object.new_from_sha(self.repo, hex_to_bin(self.dereference_recursive(self.repo, self.path)))
  File "/Library/Python/2.7/site-packages/git/objects/base.py", line 64, in new_from_sha
    oinfo = repo.odb.info(sha1)
  File "/Library/Python/2.7/site-packages/gitdb/db/base.py", line 256, in info
    return self._db_query(sha).info(sha)
  File "/Library/Python/2.7/site-packages/gitdb/db/loose.py", line 162, in info
    m = self._map_loose_object(sha)
  File "/Library/Python/2.7/site-packages/gitdb/db/loose.py", line 146, in _map_loose_object
    raise BadObject(sha)
gitdb.exc.BadObject: BadObject: 59a968e4b7bf20039a9314c383a7bb5aa955b53c
@Chronial

Quick look at the code: Won’t it only run git filter-branch on the current branch instead of the whole repo?

And since this is such a potentially destructive script, you might want to add a notice about that.

@kubark42

@bimargulies, this script failed for me as well, with the identical error. For the time being this script probably shouldn't be used, especially since as @Chronial points out it's potentially destructive.

python git-slim.py 
Garbage collecting: done
Repacking: done
Listing pack blobs: done
Traceback (most recent call last):
  File "git-slim.py", line 208, in <module>
    slim_main()
  File "git-slim.py", line 40, in slim_main
    slim(repo_dir)
  File "git-slim.py", line 49, in slim
    slim_blobs(r)
  File "git-slim.py", line 73, in slim_blobs
    index = blob_index(r)
  File "git-slim.py", line 102, in blob_index
    commits = list(r.iter_commits())
  File "/Library/Python/2.7/site-packages/GitPython-0.3.2.RC1-py2.7.egg/git/repo/base.py", line 424, in iter_commits
    rev = self.head.commit
  File "/Library/Python/2.7/site-packages/GitPython-0.3.2.RC1-py2.7.egg/git/refs/symbolic.py", line 168, in _get_commit
    obj = self._get_object()
  File "/Library/Python/2.7/site-packages/GitPython-0.3.2.RC1-py2.7.egg/git/refs/symbolic.py", line 161, in _get_object
    return Object.new_from_sha(self.repo, hex_to_bin(self.dereference_recursive(self.repo, self.path)))
  File "/Library/Python/2.7/site-packages/GitPython-0.3.2.RC1-py2.7.egg/git/objects/base.py", line 64, in new_from_sha
    oinfo = repo.odb.info(sha1)
  File "/Library/Python/2.7/site-packages/gitdb-0.5.4-py2.7-macosx-10.9-intel.egg/gitdb/db/base.py", line 256, in info
    return self._db_query(sha).info(sha)
  File "/Library/Python/2.7/site-packages/gitdb-0.5.4-py2.7-macosx-10.9-intel.egg/gitdb/db/loose.py", line 162, in info
    m = self._map_loose_object(sha)
  File "/Library/Python/2.7/site-packages/gitdb-0.5.4-py2.7-macosx-10.9-intel.egg/gitdb/db/loose.py", line 146, in _map_loose_object
    raise BadObject(sha)
gitdb.exc.BadObject: BadObject: 45bb1fbbb91af857c8566fd30fe59d6dfee0d63d
@gclsoft

Failed on OS X 10.10

python git-slim.py 
Traceback (most recent call last):
  File "git-slim.py", line 208, in <module>
    slim_main()
  File "git-slim.py", line 40, in slim_main
    slim(repo_dir)
  File "git-slim.py", line 47, in slim
    prep(r)
  File "git-slim.py", line 61, in prep
    if r.is_dirty():
TypeError: 'bool' object is not callable
@greenspray

I get this error:

File "git-slim.py", line 16, in <module>
from git import Repo
ImportError: No module named git

This means the library is not found. Where can I get the library for this?

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Something went wrong with that request. Please try again.