@shreevatsa
Last active February 2, 2023 14:18
"""Prints the git objects taking up most space, aggregating blobs by filename.
Background: git stores its objects in the .git/objects directory.
This includes "loose" objects, and one or more "packfiles".
Calling `git gc` (or `git repack`) packs up the (non-useless) loose objects.
`git verify-pack -v` shows the objects packed in the pack file.
Objects:
An object is one of {blob, tree, commit, tag}.
A blob contains the contents of a file.
A tree contains references (sha1) to blobs and other trees.
A commit contains an author, committer, message, a reference to a tree, and references to parent commit(s).
You can view an object with: git cat-file -p <object_sha1>
"""
from __future__ import unicode_literals
# import codecs
import collections
import fnmatch
import glob
import logging
import re
import subprocess
def object_line_re():
    """Regex matching the object lines from `git verify-pack -v`:

    SHA-1 type size size-in-pack-file offset-in-packfile
    SHA-1 type size size-in-pack-file offset-in-packfile depth base-SHA-1

    For example:
    8c0566d2992b4b8900cecb552e7ebe43a80e0a94 commit 114 117 20692225 1 1a278471eb3c6584a3e94e0d977775882ced0407
    afe561a8a28d088c4259a4cbc3a5b6299eebf7a2 blob   134 113 20692342
    600d9685d79d1f5da591889bd41747d41bb8e28f blob   19 30 20692455 1 afe561a8a28d088c4259a4cbc3a5b6299eebf7a2
    """
    basic_regexes = {
        'sha1_re': r'[0-9a-f]{40}',
        # `git verify-pack -v` pads the type name to 6 characters, hence the trailing spaces.
        'type_re': r'(commit|blob  |tree  |tag   )',
        'num_re': r'[0-9]{1,}'
    }
    field_regexes = {
        'object': r'(?P<object_sha1>{sha1_re})'.format(**basic_regexes),
        'type': r'(?P<object_type>{type_re})'.format(**basic_regexes),
        'orig_size': r'(?P<orig_size>{num_re})'.format(**basic_regexes),
        'packed_size': r'(?P<packed_size>{num_re})'.format(**basic_regexes),
        'offset': r'(?P<offset>{num_re})'.format(**basic_regexes),
        'depth': r'(?P<depth>{num_re})'.format(**basic_regexes),
        'base': r'(?P<base_object>{sha1_re})'.format(**basic_regexes),
    }
    line_re = '^{object} {type} {orig_size} {packed_size} {offset}( {depth})?( {base})?$'.format(**field_regexes)
    return line_re
def re_match(pattern, string):
    return re.match('^' + pattern + '$', string)
def parse_object_line(s):
    assert isinstance(s, str), (type(s), s)
    assert len(s) > 0
    m = re_match(object_line_re(), s)
    assert m, 'No match for #%s#' % s
    return {
        'sha1': m.group('object_sha1'),
        'type': m.group('object_type'),
        'orig_size': int(m.group('orig_size')),
        'packed_size': int(m.group('packed_size')),
        'offset': int(m.group('offset')),
        'depth': None if m.group('depth') is None else int(m.group('depth')),
        'base': m.group('base_object')
    }
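# For example (a sketch using the second sample line from the module docstring,
# with the type field padded as `git verify-pack -v` prints it):
#   parse_object_line('afe561a8a28d088c4259a4cbc3a5b6299eebf7a2 blob   134 113 20692342')
# returns:
#   {'sha1': 'afe561a8a28d088c4259a4cbc3a5b6299eebf7a2', 'type': 'blob  ',
#    'orig_size': 134, 'packed_size': 113, 'offset': 20692342,
#    'depth': None, 'base': None}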
def non_object_line(s):
    num_re = r'[0-9]{1,}'
    sha1_re = r'[0-9a-f]{40}'
    return (re_match('non delta: {num} objects'.format(num=num_re), s) or
            re_match('chain length = {num}: {num} objects?'.format(num=num_re), s) or
            re_match('.git/objects/pack/pack-{sha}.pack: ok'.format(sha=sha1_re), s))
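# These match the summary lines that `git verify-pack -v` prints after the
# object listing; illustrative examples (the counts here are invented):
#   non delta: 1954 objects
#   chain length = 1: 1880 objects
#   .git/objects/pack/pack-<sha1>.pack: ok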
def objects_from_verify_pack(lines):
    ret = []
    for line in lines:
        if non_object_line(line):
            continue
        obj = parse_object_line(line)
        # verify-pack lists objects in pack order, so each object should start
        # exactly where the previous one ended.
        if ret:
            assert ret[-1]['offset'] + ret[-1]['packed_size'] == obj['offset'], (ret[-1], obj)
        assert obj['type'] in ['tag   ', 'commit', 'blob  ', 'tree  '], '#%s#' % obj['type']
        ret.append(obj)
    return ret
def index_blob_names(lines):
    """Takes output from rev-list, and maps blob sha1s to file names."""
    ret = {}
    for line in lines:
        parts = line.split(' ', 1)
        # Some lines (e.g. commits) have just a sha1 and no path.
        if len(parts) > 1:
            assert len(parts) == 2, parts
            # if len(parts[1].split()) > 1:
            #     print('filename with spaces:', line)
            ret[parts[0]] = parts[1].strip()
    return ret
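# `git rev-list --objects --all` prints one object per line: a sha1, followed
# (for trees and blobs reached via a path) by that path. Illustrative sketch,
# reusing sha1s from the module docstring with an invented path:
#   8c0566d2992b4b8900cecb552e7ebe43a80e0a94
#   afe561a8a28d088c4259a4cbc3a5b6299eebf7a2 docs/img/logo.png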
def normalize_filename(name):
    # Normalization 1: glob patterns
    glob_patterns = ['logs/.nfs*', '*node_modules*', '*.png', '*.jpg', '*.sql', 'docs/downloads/*', 'docs/img/*', 'blog/*', 'downloads/*']
    for pattern in glob_patterns:
        if fnmatch.fnmatch(name, pattern):
            name = pattern
    # Normalization 2: files we don't care to distinguish
    same = [['*.sql', 'dump.csv'], ['*.png', '*.jpg']]
    for equivalence_class in same:
        if name in equivalence_class:
            name = ' or '.join(equivalence_class)
    # if orig_name != name:
    #     print('Normalized %s to %s' % (orig_name, name))
    return name
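# For example (hypothetical path): normalize_filename('docs/img/logo.png')
# returns '*.png or *.jpg': the name first collapses to the '*.png' glob
# pattern, which is then merged with '*.jpg' into one equivalence class.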
def aggregate_sizes_by_filename(objects):
    # rev_list_lines = codecs.open('git-all-objects.txt', encoding='ascii').readlines()
    rev_list_lines = run_process(['git', 'rev-list', '--objects', '--all'])
    blob_names = index_blob_names(rev_list_lines)
    aggregated_size = {'unnormalized': collections.defaultdict(int),
                       'normalized': collections.defaultdict(int)}
    total_size = 0
    logging.info('Aggregating sizes of files')
    for obj in objects:
        size = obj['packed_size']
        sha1 = obj['sha1']
        if obj['type'] == 'blob  ' and sha1 in blob_names:
            key = blob_names[sha1]
        else:
            key = '{0} ({1})'.format(sha1, obj['type'].strip())
        aggregated_size['unnormalized'][key] += size
        aggregated_size['normalized'][normalize_filename(key)] += size
        total_size += size
    logging.info('Done.')
    return (total_size, aggregated_size['unnormalized'], aggregated_size['normalized'])
def print_sizes(sizes, total_size, limit=20):
    cumulative = 0
    print('Cumulat       Size Filename')
    for (i, (key, size)) in enumerate(sorted(sizes.items(), key=lambda x: x[1], reverse=True)):
        if i >= limit:
            break
        cumulative += size
        cumulative_percent = '%6.2f%%' % (cumulative * 100.0 / total_size)
        padded_size = '%10d' % size
        print('%s %s %s' % (cumulative_percent, padded_size, key))
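# Sample output shape (the values here are illustrative):
#   Cumulat       Size Filename
#    42.10%   12345678 *.png or *.jpg
#    63.55%    6543210 dump.csv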
def run_process(cmd_parts):
    logging.info('Running: %s', ' '.join(cmd_parts))
    return subprocess.check_output(cmd_parts).decode().splitlines()
if __name__ == '__main__':
    logging.basicConfig(level=logging.DEBUG, format='%(asctime)s %(message)s', datefmt='%H:%M:%S')
    run_process(['git', 'gc'])
    # verify_pack_lines = codecs.open('git-verify-pack-unsorted', encoding='ascii').readlines()
    verify_pack_lines = run_process(['git', 'verify-pack', '-v'] +
                                    glob.glob('.git/objects/pack/pack-*.idx'))
    logging.info('Parsing the output')
    objects = objects_from_verify_pack(verify_pack_lines)
    # objects.sort(cmp=lambda x, y: cmp(x['packed_size'], y['packed_size']), reverse=True)
    (total_size, unnormalized, normalized) = aggregate_sizes_by_filename(objects)
    print('\nUnnormalized:')
    print_sizes(unnormalized, total_size)
    print('\nNormalized:')
    print_sizes(normalized, total_size)
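# Usage sketch (assumptions, not from the original gist: `git` is on PATH, the
# script is run from the root of a non-bare repository, and it is saved under a
# hypothetical name like git-pack-sizes.py):
#   python git-pack-sizes.py
# The logged steps (git gc, git verify-pack, git rev-list) print first, followed
# by the "Unnormalized:" and "Normalized:" tables from print_sizes above.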
tcecyk commented Dec 11, 2022

So useful! For anyone else using this with Python 3: after piping it through 2to3, I changed the ascii decode to a no-argument decode(). The regex can also fail because of the shell locale; setting LANG=C in os.environ helps.
Others won't need this, but I used your tool to look at bare repos (mirrored repos), which needed the regex and the glob on objects/pack/pack- adjusted. Thank you for the tool.
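(A minimal sketch of the locale adjustment described above, assuming these lines go near the top of the script, before any git command runs, so the subprocess calls inherit the C locale:)

    import os
    os.environ['LANG'] = 'C'  # force unlocalized git output so the regexes match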

shreevatsa commented Dec 17, 2022

Thank you, and you're welcome! (Sorry for the delay in replying…)

I should probably update this to Python 3; will do it sometime. Glad it helped you!

(Edit 2023-02-02: Updated to Python 3 as I just had reason to run this script again myself.)
