@shreevatsa
Last active February 2, 2023 14:18
"""Prints the git objects taking up most space, aggregating blobs by filename.
Background: git stores its objects in the .git/objects directory.
This includes "loose" objects, and one or more "packfiles".
Calling `git gc` (or `git repack`) packs up the (non-useless) loose objects.
`git verify-pack -v` shows the objects packed in the pack file.
Objects:
An object is one of {blob, tree, commit, tag}.
A blob contains the contents of a file.
A tree contains references (sha1) to blobs and other trees.
A commit contains an author, committer, message, a reference to a tree, and references to parent commit(s).
You can view an object with: git cat-file -p <object_sha1>
"""
from __future__ import unicode_literals
# import codecs
import collections
import fnmatch
import glob
import logging
import re
import subprocess
def object_line_re():
    """Regex matching the object lines from `git verify-pack -v`:

    SHA-1 type size size-in-pack-file offset-in-packfile
    SHA-1 type size size-in-pack-file offset-in-packfile depth base-SHA-1

    For example:
    8c0566d2992b4b8900cecb552e7ebe43a80e0a94 commit 114 117 20692225 1 1a278471eb3c6584a3e94e0d977775882ced0407
    afe561a8a28d088c4259a4cbc3a5b6299eebf7a2 blob   134 113 20692342
    600d9685d79d1f5da591889bd41747d41bb8e28f blob   19 30 20692455 1 afe561a8a28d088c4259a4cbc3a5b6299eebf7a2
    """
    basic_regexes = {
        'sha1_re': r'[0-9a-f]{40}',
        # `git verify-pack -v` pads the type name to 6 characters, hence the trailing spaces.
        'type_re': r'(commit|blob  |tree  |tag   )',
        'num_re': r'[0-9]{1,}'
    }
    field_regexes = {
        'object': r'(?P<object_sha1>{sha1_re})'.format(**basic_regexes),
        'type': r'(?P<object_type>{type_re})'.format(**basic_regexes),
        'orig_size': r'(?P<orig_size>{num_re})'.format(**basic_regexes),
        'packed_size': r'(?P<packed_size>{num_re})'.format(**basic_regexes),
        'offset': r'(?P<offset>{num_re})'.format(**basic_regexes),
        'depth': r'(?P<depth>{num_re})'.format(**basic_regexes),
        'base': r'(?P<base_object>{sha1_re})'.format(**basic_regexes),
    }
    line_re = '^{object} {type} {orig_size} {packed_size} {offset}( {depth})?( {base})?$'.format(**field_regexes)
    return line_re
def re_match(pattern, string):
    return re.match('^' + pattern + '$', string)
def parse_object_line(s):
    assert isinstance(s, str), (type(s), s)
    assert len(s) > 0
    m = re_match(object_line_re(), s)
    assert m, 'No match for #%s#' % s
    return {
        'sha1': m.group('object_sha1'),
        'type': m.group('object_type'),
        'orig_size': int(m.group('orig_size')),
        'packed_size': int(m.group('packed_size')),
        'offset': int(m.group('offset')),
        'depth': None if m.group('depth') is None else int(m.group('depth')),
        'base': m.group('base_object')
    }
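# For example (a sketch using the second sample line from the module docstring,
# with the type field padded as `git verify-pack -v` prints it):
#   parse_object_line('afe561a8a28d088c4259a4cbc3a5b6299eebf7a2 blob   134 113 20692342')
# returns:
#   {'sha1': 'afe561a8a28d088c4259a4cbc3a5b6299eebf7a2', 'type': 'blob  ',
#    'orig_size': 134, 'packed_size': 113, 'offset': 20692342,
#    'depth': None, 'base': None}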
def non_object_line(s):
    num_re = r'[0-9]{1,}'
    sha1_re = r'[0-9a-f]{40}'
    return (re_match('non delta: {num} objects'.format(num=num_re), s) or
            re_match('chain length = {num}: {num} objects?'.format(num=num_re), s) or
            re_match('.git/objects/pack/pack-{sha}.pack: ok'.format(sha=sha1_re), s))
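# These match the summary lines that `git verify-pack -v` prints after the
# object listing; illustrative examples (the counts here are invented):
#   non delta: 1954 objects
#   chain length = 1: 1880 objects
#   .git/objects/pack/pack-<sha1>.pack: ok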
def objects_from_verify_pack(lines):
    ret = []
    for line in lines:
        if non_object_line(line):
            continue
        obj = parse_object_line(line)
        # verify-pack lists objects in pack order, so each object should start
        # exactly where the previous one ended.
        if ret:
            assert ret[-1]['offset'] + ret[-1]['packed_size'] == obj['offset'], (ret[-1], obj)
        assert obj['type'] in ['tag   ', 'commit', 'blob  ', 'tree  '], '#%s#' % obj['type']
        ret.append(obj)
    return ret
def index_blob_names(lines):
    """Takes output from rev-list, and maps blob sha1s to file names."""
    ret = {}
    for line in lines:
        parts = line.split(' ', 1)
        # Some lines (e.g. commits) have just a sha1 and no path.
        if len(parts) > 1:
            assert len(parts) == 2, parts
            # if len(parts[1].split()) > 1:
            #     print('filename with spaces:', line)
            ret[parts[0]] = parts[1].strip()
    return ret
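# `git rev-list --objects --all` prints one object per line: a sha1, followed
# (for trees and blobs reached via a path) by that path. Illustrative sketch,
# reusing sha1s from the module docstring with an invented path:
#   8c0566d2992b4b8900cecb552e7ebe43a80e0a94
#   afe561a8a28d088c4259a4cbc3a5b6299eebf7a2 docs/img/logo.png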
def normalize_filename(name):
    # Normalization 1: glob patterns
    glob_patterns = ['logs/.nfs*', '*node_modules*', '*.png', '*.jpg', '*.sql', 'docs/downloads/*', 'docs/img/*', 'blog/*', 'downloads/*']
    for pattern in glob_patterns:
        if fnmatch.fnmatch(name, pattern):
            name = pattern
    # Normalization 2: files we don't care to distinguish
    same = [['*.sql', 'dump.csv'], ['*.png', '*.jpg']]
    for equivalence_class in same:
        if name in equivalence_class:
            name = ' or '.join(equivalence_class)
    # if orig_name != name:
    #     print('Normalized %s to %s' % (orig_name, name))
    return name
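# For example (hypothetical path): normalize_filename('docs/img/logo.png')
# returns '*.png or *.jpg': the name first collapses to the '*.png' glob
# pattern, which is then merged with '*.jpg' into one equivalence class.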
def aggregate_sizes_by_filename(objects):
    # rev_list_lines = codecs.open('git-all-objects.txt', encoding='ascii').readlines()
    rev_list_lines = run_process(['git', 'rev-list', '--objects', '--all'])
    blob_names = index_blob_names(rev_list_lines)
    aggregated_size = {'unnormalized': collections.defaultdict(int),
                       'normalized': collections.defaultdict(int)}
    total_size = 0
    logging.info('Aggregating sizes of files')
    for obj in objects:
        size = obj['packed_size']
        sha1 = obj['sha1']
        if obj['type'] == 'blob  ' and sha1 in blob_names:
            key = blob_names[sha1]
        else:
            key = '{0} ({1})'.format(sha1, obj['type'].strip())
        aggregated_size['unnormalized'][key] += size
        aggregated_size['normalized'][normalize_filename(key)] += size
        total_size += size
    logging.info('Done.')
    return (total_size, aggregated_size['unnormalized'], aggregated_size['normalized'])
def print_sizes(sizes, total_size, limit=20):
    cumulative = 0
    print('Cumulat       Size Filename')
    for (i, (key, size)) in enumerate(sorted(sizes.items(), key=lambda x: x[1], reverse=True)):
        if i >= limit:
            break
        cumulative += size
        cumulative_percent = '%6.2f%%' % (cumulative * 100.0 / total_size)
        padded_size = '%10d' % size
        print('%s %s %s' % (cumulative_percent, padded_size, key))
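# Sample output shape (the values here are illustrative):
#   Cumulat       Size Filename
#    42.10%   12345678 *.png or *.jpg
#    63.55%    6543210 dump.csv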
def run_process(cmd_parts):
    logging.info('Running: %s', ' '.join(cmd_parts))
    return subprocess.check_output(cmd_parts).decode().splitlines()
if __name__ == '__main__':
    logging.basicConfig(level=logging.DEBUG, format='%(asctime)s %(message)s', datefmt='%H:%M:%S')
    run_process(['git', 'gc'])
    # verify_pack_lines = codecs.open('git-verify-pack-unsorted', encoding='ascii').readlines()
    verify_pack_lines = run_process(['git', 'verify-pack', '-v'] +
                                    glob.glob('.git/objects/pack/pack-*.idx'))
    logging.info('Parsing the output')
    objects = objects_from_verify_pack(verify_pack_lines)
    # objects.sort(cmp=lambda x, y: cmp(x['packed_size'], y['packed_size']), reverse=True)
    (total_size, unnormalized, normalized) = aggregate_sizes_by_filename(objects)
    print('\nUnnormalized:')
    print_sizes(unnormalized, total_size)
    print('\nNormalized:')
    print_sizes(normalized, total_size)
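# Usage sketch (assumptions, not from the original gist: `git` is on PATH, the
# script is run from the root of a non-bare repository, and it is saved under a
# hypothetical name like git-pack-sizes.py):
#   python git-pack-sizes.py
# The logged steps (git gc, git verify-pack, git rev-list) print first, followed
# by the "Unnormalized:" and "Normalized:" tables from print_sizes above.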
tcecyk commented Dec 11, 2022

So useful! For anyone else using this with Python 3: after piping it through 2to3, I changed the ascii decode to a no-argument decode(). The regex can also fail because of the shell locale; setting LANG=C in os.environ helps.
Others won't need this, but I used your tool to look at bare repos (mirrored repos), which needed the regex and the glob on objects/pack/pack- adjusted. Thank you for the tool.
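(A minimal sketch of the locale adjustment described above, assuming these lines go near the top of the script, before any git command runs, so the subprocess calls inherit the C locale:)

    import os
    os.environ['LANG'] = 'C'  # force unlocalized git output so the regexes match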

shreevatsa commented Dec 17, 2022

Thank you, and you're welcome! (Sorry for the delay in replying…)

I should probably update this to Python 3; will do it sometime. Glad it helped you!

(Edit 2023-02-02: Updated to Python 3 as I just had reason to run this script again myself.)
