Skip to content

Instantly share code, notes, and snippets.

@panzerama
Created December 3, 2018 23:30
Show Gist options
  • Save panzerama/b46a977ae1acf31a5a941bab7db00a70 to your computer and use it in GitHub Desktop.
Save panzerama/b46a977ae1acf31a5a941bab7db00a70 to your computer and use it in GitHub Desktop.
A Python script for visualizing the size of objects in your git repo.
#!/usr/bin/env python3
"""
Counts the sizes of the blob objects in your repository's packfiles and
groups them into size buckets for a histogram. Defaults to buckets of 1M.
Requires the third-party libraries matplotlib and numpy.
Author: Jason Drew Panzer
"""
from matplotlib import pyplot as plt
from subprocess import run
import numpy as np
def packfile_histogram(bucket_size_mb=1):
    """Plot a histogram of the blob sizes stored in the repository's packfiles.

    Runs ``git verify-pack -v`` on ``.git/objects/pack/pack-*.idx`` (resolved
    relative to the current working directory), converts each blob size from
    bytes to units of 1024**2 bytes, groups the sizes into fixed-width
    buckets and shows a bar chart with matplotlib.

    The first bucket (blobs smaller than one bucket) is left out of the
    visible axis range so it does not dwarf the larger objects; when every
    blob falls into that first bucket it is shown instead of an empty plot.

    Args:
        bucket_size_mb: Width of each histogram bucket, in the same unit as
            the x axis. Defaults to 1.

    Raises:
        ValueError: If ``bucket_size_mb`` is not positive.
    """
    if bucket_size_mb <= 0:
        raise ValueError("bucket_size_mb must be positive")

    # git verify-pack for all pack index files, filtered by blobs and sorted by size in reverse order
    listing = run('git verify-pack -v .git/objects/pack/pack-*.idx | grep blob | grep -v chain | sort -k3nr', shell=True, capture_output=True, encoding='utf-8')

    sizes_mb = []
    for row in listing.stdout.splitlines():
        fields = row.split()
        # The object size is the third column; skip anything that is not a
        # well-formed object row rather than crashing on int().
        if len(fields) < 3 or not fields[2].isdigit():
            continue
        sizes_mb.append(int(fields[2]) / (1024 ** 2))

    if not sizes_mb:
        # No packfiles, no blobs, or git failed: there is nothing to draw.
        print("No blob objects found in .git/objects/pack; nothing to plot.")
        return

    # Build explicit bucket edges so every bucket is exactly bucket_size_mb
    # wide; asking numpy for a fixed *number* of bins would make the bucket
    # width depend on the largest object.
    largest = max(sizes_mb)
    bucket_count = max(1, int(np.ceil(largest / bucket_size_mb)))
    edges = np.arange(bucket_count + 1, dtype=float) * bucket_size_mb
    # Guard against float rounding leaving the largest object outside the
    # last bucket (np.histogram drops out-of-range values).
    edges[-1] = max(edges[-1], largest)
    counts, edges = np.histogram(sizes_mb, bins=edges)

    # show off that histogram; bars are anchored on their left edge so each
    # one covers exactly the bucket it represents
    plt.bar(edges[:-1], counts, width=np.diff(edges), align='edge')
    plt.xlabel("size in mb")
    plt.ylabel("number of objects")
    if bucket_count > 1:
        # Hide the first bucket and scale the y axis to the remaining ones.
        plt.xlim(edges[1], edges[-1])
        plt.ylim(0, max(1, counts[1:].max()))
    else:
        plt.xlim(edges[0], edges[-1])
        plt.ylim(0, counts.max())
    plt.show()
# Draw the histogram only when executed as a script, not when imported.
if "__main__" == __name__:
    packfile_histogram()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment