-
-
Save shreevatsa/ed7c6661ebfa43d8427639f672804c6b to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""Prints the git objects taking up most space, aggregating blobs by filename. | |
Background: git stores its objects in the .git/objects directory. | |
This includes "loose" objects, and one or more "packfiles". | |
Calling `git gc` (or `git repack`) packs up the (non-useless) loose objects. | |
`git verify-pack -v` shows the objects packed in the pack file. | |
Objects: | |
An object is one of {blob, tree, commit, tag}. | |
A blob contains the contents of a file. | |
A tree contains references (sha1) to blobs and other trees. | |
A commit contains an author, committer, message, a reference to a tree, and references to parent commit(s). | |
You can view an object with: git cat-file -p <object_sha1> | |
""" | |
from __future__ import unicode_literals | |
# import codecs | |
import collections | |
import fnmatch | |
import glob | |
import logging | |
import re | |
import subprocess | |
def object_line_re():
    """Build the regex matching object lines from `git verify-pack -v`.

    The two line shapes are:
        SHA-1 type size size-in-pack-file offset-in-packfile
        SHA-1 type size size-in-pack-file offset-in-packfile depth base-SHA-1
    For example:
        8c0566d2992b4b8900cecb552e7ebe43a80e0a94 commit 114 117 20692225 1 1a278471eb3c6584a3e94e0d977775882ced0407
        afe561a8a28d088c4259a4cbc3a5b6299eebf7a2 blob 134 113 20692342
        600d9685d79d1f5da591889bd41747d41bb8e28f blob 19 30 20692455 1 afe561a8a28d088c4259a4cbc3a5b6299eebf7a2
    """
    sha1 = r'[0-9a-f]{40}'
    # git pads the type column, hence the trailing spaces in short type names.
    obj_type = r'(commit|blob |tree |tag )'
    num = r'[0-9]{1,}'
    mandatory = [
        r'(?P<object_sha1>%s)' % sha1,
        r'(?P<object_type>%s)' % obj_type,
        r'(?P<orig_size>%s)' % num,
        r'(?P<packed_size>%s)' % num,
        r'(?P<offset>%s)' % num,
    ]
    # Depth and base only appear for deltified objects.
    optional = r'( (?P<depth>%s))?( (?P<base_object>%s))?' % (num, sha1)
    return '^' + ' '.join(mandatory) + optional + '$'
def re_match(pattern, string):
    """Anchored match: succeed only when *string* matches *pattern* end to end."""
    anchored = '^{0}$'.format(pattern)
    return re.match(anchored, string)
def parse_object_line(s):
    """Parse one object line of `git verify-pack -v` output into a dict.

    Returns a dict with keys: 'sha1', 'type' (including git's padding
    spaces), 'orig_size', 'packed_size', 'offset' (ints), 'depth' (int, or
    None for non-delta objects) and 'base' (base object sha1, or None).
    Raises AssertionError when the line does not look like an object line.
    """
    assert isinstance(s, str), (type(s), s)
    assert len(s) > 0
    m = re_match(object_line_re(), s)
    assert m, 'No match for #%s#' % s
    depth = m.group('depth')
    return {
        'sha1': m.group('object_sha1'),
        'type': m.group('object_type'),
        'orig_size': int(m.group('orig_size')),
        'packed_size': int(m.group('packed_size')),
        'offset': int(m.group('offset')),
        # Fixed `== None` -> `is None` (PEP 8 E711): identity, not equality.
        'depth': None if depth is None else int(depth),
        'base': m.group('base_object'),
    }
def non_object_line(s):
    """Return a truthy match iff *s* is a summary/footer line of verify-pack.

    These are the 'non delta', 'chain length' and trailing '... ok' lines
    that `git verify-pack -v` prints after the per-object listing.
    """
    digits = r'[0-9]{1,}'
    hexsha = r'[0-9a-f]{40}'
    summary_patterns = (
        'non delta: %s objects' % digits,
        'chain length = %s: %s objects?' % (digits, digits),
        '.git/objects/pack/pack-%s.pack: ok' % hexsha,
    )
    for pattern in summary_patterns:
        match = re_match(pattern, s)
        if match:
            return match
    return None
def objects_from_verify_pack(lines):
    """Parse `git verify-pack -v` output lines into a list of object dicts.

    Summary lines are skipped; object lines are parsed and sanity-checked:
    consecutive objects must be laid out back to back in the packfile, and
    every object type must be one of git's four object types.
    """
    objects = []
    for raw_line in lines:
        if non_object_line(raw_line):
            continue
        parsed = parse_object_line(raw_line)
        if objects:
            previous = objects[-1]
            # Each object should start exactly where the previous one ended.
            assert previous['offset'] + previous['packed_size'] == parsed['offset'], (previous, parsed)
        assert parsed['type'] in ['tag ', 'commit', 'blob ', 'tree '], '#%s#' % parsed['type']
        objects.append(parsed)
    return objects
def index_blob_names(lines):
    """Build a {sha1: filename} map from `git rev-list --objects` output.

    Each input line is either "<sha1>" (no associated path, e.g. commits,
    trees, unreachable blobs) or "<sha1> <path>"; only the latter contribute
    entries to the returned dict.
    """
    names = {}
    for line in lines:
        sha_and_name = line.split(' ', 1)
        if len(sha_and_name) == 1:
            # Just a sha1 with no filename; nothing to index.
            continue
        assert len(sha_and_name) == 2, sha_and_name
        sha1, name = sha_and_name
        # Filenames may themselves contain spaces; only strip the edges.
        names[sha1] = name.strip()
    return names
def normalize_filename(name):
    """Collapse a filename into a coarser bucket so sizes aggregate usefully."""
    # Step 1: replace the name with any glob pattern it matches
    # (patterns are tried in order against the possibly-updated name).
    for pattern in ('logs/.nfs*', '*node_modules*', '*.png', '*.jpg', '*.sql',
                    'docs/downloads/*', 'docs/img/*', 'blog/*', 'downloads/*'):
        if fnmatch.fnmatch(name, pattern):
            name = pattern
    # Step 2: merge buckets we don't care to tell apart.
    for equivalence_class in (['*.sql', 'dump.csv'], ['*.png', '*.jpg']):
        if name in equivalence_class:
            name = ' or '.join(equivalence_class)
    return name
def aggregate_sizes_by_filename(objects):
    """Sum packed sizes per filename (for named blobs) or per sha1 (others).

    Runs `git rev-list --objects --all` to map blob sha1s to filenames.
    Returns (total_size, sizes_by_raw_name, sizes_by_normalized_name), the
    last two being dicts mapping key -> total packed size in bytes.
    """
    rev_list_lines = run_process(['git', 'rev-list', '--objects', '--all'])
    blob_names = index_blob_names(rev_list_lines)
    raw_sizes = collections.defaultdict(int)
    normalized_sizes = collections.defaultdict(int)
    total_size = 0
    logging.info('Aggregating sizes of files')
    for obj in objects:
        size = obj['packed_size']
        sha1 = obj['sha1']
        if obj['type'] == 'blob ' and sha1 in blob_names:
            key = blob_names[sha1]
        else:
            # Non-blobs and unreachable blobs have no filename; key by sha1.
            key = '{0} ({1})'.format(sha1, obj['type'].strip())
        raw_sizes[key] += size
        normalized_sizes[normalize_filename(key)] += size
        total_size += size
    logging.info('Done.')
    return (total_size, raw_sizes, normalized_sizes)
def print_sizes(sizes, total_size, limit=20):
    """Print up to *limit* entries of *sizes*, largest first, with a running
    cumulative percentage of *total_size*."""
    print('Cumulat Size Filename')
    cumulative = 0
    ranked = sorted(sizes.items(), key=lambda kv: kv[1], reverse=True)
    for key, size in ranked[:limit]:
        cumulative += size
        percent = cumulative * 100.0 / total_size
        print('%6.2f%% %10d %s' % (percent, size, key))
def run_process(cmd_parts):
    """Run *cmd_parts* (an argv list) and return its stdout as a list of lines.

    Raises subprocess.CalledProcessError on a nonzero exit status.
    """
    logging.info('Running: %s', ' '.join(cmd_parts))
    output = subprocess.check_output(cmd_parts)
    return output.decode().splitlines()
if __name__ == '__main__':
    logging.basicConfig(level=logging.DEBUG, format='%(asctime)s %(message)s', datefmt='%H:%M:%S')
    # Repack loose objects first so verify-pack sees everything.
    run_process(['git', 'gc'])
    pack_indexes = glob.glob('.git/objects/pack/pack-*.idx')
    verify_pack_lines = run_process(['git', 'verify-pack', '-v'] + pack_indexes)
    logging.info('Parsing the output')
    objects = objects_from_verify_pack(verify_pack_lines)
    (total_size, unnormalized, normalized) = aggregate_sizes_by_filename(objects)
    print('\nUnnormalized:')
    print_sizes(unnormalized, total_size)
    print('\nNormalized:')
    print_sizes(normalized, total_size)
Thank you, and you're welcome! (Sorry for the delay in replying…)
I should probably update this to Python 3; will do it sometime. Glad it helped you!
(Edit 2023-02-02: Updated to Python 3 as I just had reason to run this script again myself.)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
So useful! Anyone else using it for Python 3: after piping through 2to3 I altered the ascii decode to a no-argument decode(). The regex can also fail due to the shell locale — setting os.environ['LANG'] = 'C' helps.
Others won't need this, but I used your tool to look at bare repos (mirrored repos), which needed adjusting the regex and glob to use objects/pack/pack- (no leading .git/). Thank you for the tool.