
Remove large objects from a git repository

git-slim.py
Python
#!/usr/bin/python
#
# git-slim
#
# Remove big files from git repo history.
#
# Requires GitPython (https://github.com/gitpython-developers/GitPython)
#
# References:
# - http://help.github.com/remove-sensitive-data/
# - http://stackoverflow.com/questions/4444091/git-filter-branch-to-delete-large-file
# - http://stackoverflow.com/questions/1029969/why-is-my-git-repository-so-big/1036595#1036595
# - http://stackoverflow.com/questions/460331/git-finding-a-filename-from-a-sha1
 
from glob import glob
from git import Repo
from os import walk
from os.path import getsize, join
from re import split
from shutil import rmtree
from sys import argv, exit, stdout
 
 
def print_activity(start, end='done'):
    '''Decorator which logs info like "Doing something: done" to stdout.'''
    def decorate(f):
        def wrapped(*args, **kwargs):
            stdout.write('%s: ' % start)
            stdout.flush()
            x = f(*args, **kwargs)
            print end
            return x
        return wrapped
    return decorate
 
 
def slim_main():
    '''Invoke slimming on working directory or first argv entry.'''
    repo_dir = argv[1] if len(argv) > 1 else '.'
    try:
        slim(repo_dir)
    except KeyboardInterrupt:
        exit(0)


def slim(repo_dir):
    r = Repo(repo_dir)
    prep(r)
    old_size = repo_size(r)
    slim_blobs(r)
    tidy_up(r)
    new_size = repo_size(r)
    ok_done(old_size, new_size)
 
 
def repo_size(r):
    '''Total size of files under the .git directory. (os.path.getsize on a
    directory only returns the size of the directory entry itself, so walk
    the tree and sum file sizes instead.)'''
    total = 0
    for root, dirs, files in walk(r.git_dir):
        for name in files:
            total += getsize(join(root, name))
    return total


def prep(r):
    '''Prep a repo by running GC and repacking.'''
    if r.is_dirty():
        raise Exception('repo is dirty')
    gc(r)
    repack(r)
 
 
def slim_blobs(r):
    '''Reduce repo size by listing blobs in size order and asking the user if
    they would like to remove them.

    '''
    pack_blobs = list_pack_blobs_by_size(r)
    index = blob_index(r)
    seen = []
    targets = []
    for b in pack_blobs:
        if b[0] not in index:
            print '%s not in blob index' % b[0]
        else:
            blob_path, commit_hexsha = index[b[0]]
            if blob_path not in seen:
                blob_size = format_size(b[1])
                commit_hexsha_prefix = commit_hexsha[:7]
                prompt = 'Remove %s (%s at %s)? [Y/n/d] ' % \
                    (blob_path, blob_size, commit_hexsha_prefix)
                answer = raw_input(prompt).strip().lower()
                if answer == 'd':
                    break
                elif answer in ('y', ''):
                    targets.append(blob_path)
                seen.append(blob_path)
    remove_files(r, targets)
 
 
def blob_index(r):
    '''Build index of paths of blobs in the repo. Iterates across all files in
    all commits and records blob used.

    '''
    desc = 'Indexing blobs in commits: '
    index = {}
    commits = list(r.iter_commits())
    commits_len = len(commits)
    blob_predicate = lambda i, d: i.type == 'blob'
    i = 1
    for commit in commits:
        stdout.write('\r%s(%s/%s)' % (desc, i, commits_len))
        stdout.flush()
        for blob in commit.tree.traverse(predicate=blob_predicate):
            index[blob.hexsha] = blob.path, str(commit)
        i += 1
    print '\r%sdone ' % desc
    return index
 
 
@print_activity('Listing pack blobs')
def list_pack_blobs_by_size(r):
    blobs = list_pack_blobs(r)
    blobs_s = sorted(blobs, key=lambda b: b[1], reverse=True)
    return blobs_s


def list_pack_blobs(r):
    '''Call git verify-pack to dump info about blobs in a pack.'''
    pack_index_glob = r.git_dir + '/objects/pack/pack-*.idx'
    pack_index_files = glob(pack_index_glob)
    pack_info = r.git.verify_pack(*pack_index_files, verbose=True)
    return extract_blob_info(pack_info)


def extract_blob_info(pack_info):
    '''Extract info about blobs in a pack from text returned by git verify-pack.

    '''
    for line in pack_info.split('\n'):
        bits = split(r'\s+', line)
        if len(bits) > 1 and bits[1] == 'blob':
            yield bits[0], int(bits[3])
 
 
def format_size(num):
    '''Format numbers as file sizes. From hurry.filesize.'''
    for x in [' bytes', 'KB', 'MB', 'GB', 'TB']:
        if num < 1024.0:
            return "%.0f%s" % (num, x)
        num /= 1024.0
 
 
@print_activity('Removing files from repo history')
def remove_files(r, fs):
    '''Run git rm for each file in list against each commit using git
    filter-branch. Completely removes files from repo history.

    '''
    if not fs:
        return
    # todo: check file list doesn't exceed max command length
    filelist = ' '.join(fs)
    r.git.filter_branch('--index-filter',
                        'git rm --cached --ignore-unmatch %s' % filelist,
                        '--prune-empty',
                        'HEAD')
 
 
def tidy_up(r):
    '''Tidy up by expiring reflog, aggressively GCing repo and repacking. Should
    recover space used by objects removed during slimming process.

    '''
    rm_original_refs(r)
    expire_reflog(r)
    gc(r)
    repack(r)
 
 
@print_activity('Removing original refs')
def rm_original_refs(r):
    rmtree(r.git_dir + '/refs/original/', ignore_errors=True)


@print_activity('Expiring reflog')
def expire_reflog(r):
    r.git.reflog('expire', '--expire=now', '--all')


@print_activity('Garbage collecting')
def gc(r):
    r.git.gc(prune=True)


@print_activity('Repacking')
def repack(r):
    r.git.repack(a=True, d=True, q=True)
 
 
def ok_done(old_size, new_size):
    delta = format_size(old_size - new_size)
    old_f = format_size(old_size)
    new_f = format_size(new_size)
    print '\nRepo slimmed by %s (reduced from %s to %s)' % (delta, old_f, new_f)
    print '(Running \'git gc --aggressive --prune\' may reclaim further space)\n'
    print 'Next run \'git push origin --all --force\''
    print 'Then re-clone all copies of the repo'
    print 'Warning: If an old clone is used, big objects may reappear'
 
 
if __name__ == '__main__':
    slim_main()
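
For reference, here is a minimal sketch of driving the script from another Python program instead of the command line. It assumes the file has been saved under an importable name such as git_slim.py; the module name and repository path below are examples only, not part of the gist.

# Hypothetical programmatic use; git_slim and the path are illustrative.
from git_slim import slim

slim('/path/to/bloated-repo')  # preps, prompts per blob, then expires reflog, GCs and repacks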

This works amazingly well! I had to run git gc --aggressive --prune to get it to take effect, but after that it was magic!

Mega thanks!
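
(For anyone doing that follow-up from Python rather than the shell, a rough GitPython equivalent is sketched below; the repository path is only an example.)

# Rough equivalent of the manual 'git gc --aggressive --prune' follow-up,
# using the same GitPython command wrapper the script relies on.
from git import Repo

repo = Repo('/path/to/slimmed-repo')  # hypothetical path
repo.git.gc('--aggressive', '--prune')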

Great! Glad you found it helpful.


Thanks for the little script. It made this time-consuming process somewhat easier. Thanks again.

Dustin

This exploded for me:

Garbage collecting: done
Repacking: done
Listing pack blobs: done
Traceback (most recent call last):
  File "/Users/benson/bin/git-slim.py", line 208, in <module>
    slim_main()
  File "/Users/benson/bin/git-slim.py", line 40, in slim_main
    slim(repo_dir)
  File "/Users/benson/bin/git-slim.py", line 49, in slim
    slim_blobs(r)
  File "/Users/benson/bin/git-slim.py", line 73, in slim_blobs
    index = blob_index(r)
  File "/Users/benson/bin/git-slim.py", line 102, in blob_index
    commits = list(r.iter_commits())
  File "/Library/Python/2.7/site-packages/git/repo/base.py", line 423, in iter_commits
    rev = self.head.commit
  File "/Library/Python/2.7/site-packages/git/refs/symbolic.py", line 168, in _get_commit
    obj = self._get_object()
  File "/Library/Python/2.7/site-packages/git/refs/symbolic.py", line 161, in _get_object
    return Object.new_from_sha(self.repo, hex_to_bin(self.dereference_recursive(self.repo, self.path)))
  File "/Library/Python/2.7/site-packages/git/objects/base.py", line 64, in new_from_sha
    oinfo = repo.odb.info(sha1)
  File "/Library/Python/2.7/site-packages/gitdb/db/base.py", line 256, in info
    return self._db_query(sha).info(sha)
  File "/Library/Python/2.7/site-packages/gitdb/db/loose.py", line 162, in info
    m = self._map_loose_object(sha)
  File "/Library/Python/2.7/site-packages/gitdb/db/loose.py", line 146, in _map_loose_object
    raise BadObject(sha)
gitdb.exc.BadObject: BadObject: 59a968e4b7bf20039a9314c383a7bb5aa955b53c

Quick look at the code: Won’t it only run git filter-branch on the current branch instead of the whole repo?

And since this is such a potentially destructive script, you might want to add a notice about that.
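
That reading looks right: remove_files passes only 'HEAD' to git filter-branch, so only the current branch gets rewritten. A possible variant that rewrites every branch and tag is sketched below; it is only an illustration, not something the gist does, and it assumes the ref selection can be passed through to git filter-branch after a literal '--' via GitPython's command wrapper.

# Hypothetical variant of remove_files that rewrites all refs, not just HEAD.
@print_activity('Removing files from repo history')
def remove_files_all_refs(r, fs):
    if not fs:
        return
    filelist = ' '.join(fs)
    r.git.filter_branch('--index-filter',
                        'git rm --cached --ignore-unmatch %s' % filelist,
                        '--prune-empty',
                        '--tag-name-filter', 'cat',  # point rewritten tags at the new commits
                        '--', '--all')               # rewrite every ref, not just HEAD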
