Skip to content

Instantly share code, notes, and snippets.

@almarklein
Created August 23, 2013 19:49
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save almarklein/6323266 to your computer and use it in GitHub Desktop.
Save almarklein/6323266 to your computer and use it in GitHub Desktop.
Some tools for analyzing the files (also deleted files) in the repo and removing them completely from history. Warning: black magic is involved, and the commit hashes will be changed.
#!/usr/bin/env python3
"""
Functionality for cleaning up your git repository.
This uses commands found on:
https://help.github.com/articles/remove-sensitive-data
"""
import os
import sys
import subprocess
def call(cmd):
    """Run *cmd* through the system shell and return its exit status.

    The command string is handed to the shell unmodified (``shell=True``),
    so pipes, redirections and globs in *cmd* work as they would on the
    command line.  The exit status is returned so that callers can detect
    a failed command; callers that do not care may ignore it.
    """
    return subprocess.call(cmd, shell=True)
def remove(path):
    """Remove the given path (file or directory) from the history
    of this repository.

    Runs ``git filter-branch`` over all refs with an index filter that
    drops *path* from every commit.  Commits that become empty are pruned
    and tags are rewritten to point at the new commits.

    *path* is shell-quoted, so names containing spaces, quotes or shell
    metacharacters are passed to ``git rm`` as a single literal argument.
    """
    import shlex

    # The index filter is itself a shell command that git evaluates once per
    # commit, so the path has to survive two rounds of shell parsing: quote
    # the path for the inner command, then quote the whole inner command for
    # the outer one.  The "--" stops a path starting with "-" from being read
    # as an option.
    index_filter = ("git rm -r --cached --ignore-unmatch -- "
                    + shlex.quote(path))
    command = ("git filter-branch --force --index-filter "
               + shlex.quote(index_filter)
               + " --prune-empty --tag-name-filter cat -- --all")
    call(command)
def clean():
    """Clean the backup refs that were created during the remove step.

    ``git filter-branch`` keeps a backup of every rewritten ref under
    ``refs/original/``.  All of them are deleted here (not only the one for
    ``master``), then the reflog is expired and unreachable objects are
    pruned.
    """
    # Feed one "delete <ref>" instruction per backup ref to update-ref; this
    # is a no-op when there are no backup refs.
    call("git for-each-ref --format='delete %(refname)' refs/original"
         " | git update-ref --stdin")
    call("git reflog expire --expire=now --all")
    call("git gc --prune=now")
def deleted():
    """Print the deleted objects known by the repo.

    Runs ``git log`` restricted to commits that delete files, with a
    summary, and prints only the output lines that contain "delete".
    Nothing is returned.
    """
    pipeline = "git log --diff-filter=D --summary | grep delete"
    subprocess.call(pipeline, shell=True)
def all():
    """Print a list of all paths that were ever in the repo.

    Lists every object reachable from any ref, sorts by path, strips the
    leading object hash and prints each distinct path once.  Nothing is
    returned.

    NOTE: the name shadows the builtin ``all()``; it is kept because it is
    the function's existing public name.
    """
    # "-f 2-" keeps everything after the hash, so paths containing spaces
    # are printed in full instead of being cut at the first space.
    subprocess.call(
        "git rev-list --objects --all | sort -k 2 | cut -f 2- -d ' ' | uniq",
        shell=True)
def sizes(maxcount=20):
    """Print the N biggest files in the repo (with their size in bytes).

    Works through three scratch files written to the current directory
    (``allfileshas.txt``, ``bigobjects.txt`` and ``bigtosmall.txt``), which
    are removed at the start of each run and left in place afterwards.

    maxcount -- maximum number of entries to print; zero or a negative
                value prints nothing.

    The pack index is looked up as ``.git/objects/pack/pack-*.idx``
    relative to the current directory.
    """
    from itertools import islice

    # Clear leftovers from a previous run; the last file is appended to.
    for fname in ('bigtosmall.txt', 'allfileshas.txt', 'bigobjects.txt'):
        if os.path.isfile(fname):
            os.remove(fname)

    # Git commands
    # 1. Every object hash together with its path.
    call("git rev-list --objects --all | sort -k 2 > allfileshas.txt")
    # 2. Every blob in the pack files, biggest first.  Raw strings keep the
    #    regex backslashes intact without invalid-escape warnings.
    call(r'git gc && git verify-pack -v .git/objects/pack/pack-*.idx'
         r' | grep -E "^\w+ blob\W+[0-9]+ [0-9]+ [0-9]+$"'
         r' | sort -k 3 -n -r > bigobjects.txt')
    # 3. Join both lists on the hash, producing "<hash> <size> <path>".
    call("for SHA in `cut -f 1 -d' ' < bigobjects.txt`; do\n"
         "echo $(grep $SHA bigobjects.txt) $(grep $SHA allfileshas.txt)"
         " | awk '{print $1,$3,$7}' >> bigtosmall.txt\n"
         "done;")

    # Show results
    if not os.path.isfile('bigtosmall.txt'):
        # No blobs were found (or a git command failed): nothing to show.
        return
    with open('bigtosmall.txt', 'r') as f:
        for line in islice(f, max(maxcount, 0)):
            fields = line.strip().split(' ', 1)
            if len(fields) > 1:
                # Drop the hash, print "<size> <path>".
                print(fields[1])
if __name__ == '__main__':
    # Command-line entry point: the first argument selects the action, the
    # second (where applicable) is its parameter.
    usage = ("usage: %s {remove <path> | clean | deleted | all | sizes [N]}"
             % os.path.basename(sys.argv[0]))

    if len(sys.argv) < 2:
        # No command given: explain instead of failing with an IndexError.
        sys.exit(usage)

    command = sys.argv[1]
    params = sys.argv[2:]

    if command == 'remove':
        if not params:
            sys.exit("remove: missing path\n" + usage)
        remove(params[0])
    elif command == 'clean':
        clean()
    elif command == 'deleted':
        deleted()
    elif command == 'all':
        all()
    elif command == 'sizes':
        if params:
            try:
                count = int(params[0])
            except ValueError:
                sys.exit("sizes: N must be an integer\n" + usage)
            sizes(count)
        else:
            sizes()
    else:
        print('Invalid command')
        # Exit with a non-zero status so scripts can detect the mistake.
        sys.exit(usage)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment