Navigation Menu

Skip to content

Instantly share code, notes, and snippets.

@MatthewWilkes
Created January 24, 2019 16:30
Show Gist options
  • Star 4 You must be signed in to star a gist
  • Fork 2 You must be signed in to fork a gist
  • Save MatthewWilkes/da2c43a407b3c3a8e6bff8a8cc0caa93 to your computer and use it in GitHub Desktop.
Save MatthewWilkes/da2c43a407b3c3a8e6bff8a8cc0caa93 to your computer and use it in GitHub Desktop.
Extract deleted commits from a GitHub repo
import argparse
import os
import re
import subprocess
import tempfile
import requests
def get_repo(owner, repo):
    """Clone a GitHub repository into a fresh temporary directory.

    Args:
        owner: GitHub user or organisation that owns the repository.
        repo: Repository name.

    Returns:
        Path of the temporary directory containing the clone. The caller
        is responsible for removing it when finished.

    Raises:
        subprocess.CalledProcessError: if ``git clone`` exits non-zero.
        OSError: if the ``git`` executable cannot be started.
    """
    import shutil

    tempdir = tempfile.mkdtemp()
    # Clone over HTTPS rather than plain HTTP.
    url = "https://github.com/%s/%s.git" % (owner, repo)
    try:
        subprocess.check_output(['git', 'clone', url, tempdir])
    except (subprocess.CalledProcessError, OSError):
        # Don't leave an orphaned temporary directory behind on failure.
        shutil.rmtree(tempdir, ignore_errors=True)
        raise
    return tempdir
def get_shas(checkout):
    """List the 4-character prefixes of every packed object in a checkout.

    Runs ``git repack`` so loose objects end up in a pack, then reads every
    pack index under ``.git/objects/pack`` with ``git verify-pack -v``.
    All object types listed by ``verify-pack`` are included, not only
    commits.

    Args:
        checkout: Path to a git working copy (relative or absolute).

    Returns:
        A set of 4-character hexadecimal prefixes. Empty if the checkout
        has no pack files.

    Raises:
        subprocess.CalledProcessError: if a git command exits non-zero.
    """
    # Resolve once so the paths below stay valid, and run git with ``cwd=``
    # instead of changing the working directory of the whole process.
    checkout = os.path.abspath(checkout)
    subprocess.check_output(['git', 'repack'], cwd=checkout)
    packdir = os.path.join(checkout, '.git', 'objects', 'pack')

    # Object lines start with the full hex object name; summary lines
    # ("non delta: ...", "chain length = ...", "<pack>: ok") do not.
    object_line = re.compile(r'^([0-9a-f]{40,})\s')

    shas = set()
    # A repository may hold more than one pack, so read every index.
    for filename in os.listdir(packdir):
        if not filename.endswith('.idx'):
            continue
        output = subprocess.check_output(
            ['git', 'verify-pack', '-v', os.path.join(packdir, filename)],
            cwd=checkout)
        # check_output returns bytes on Python 3 and str on Python 2.
        if not isinstance(output, str):
            output = output.decode('ascii', 'replace')
        for line in output.splitlines():
            match = object_line.match(line)
            if match:
                shas.add(match.group(1)[:4])
    return shas
def potential_hashes(checkout, shas=None):
    """Get all 4 digit hashes that aren't in our known list.

    Args:
        checkout: Path to the git checkout; only used when ``shas`` is None.
        shas: Optional iterable of known 4-character hex prefixes. When
            omitted, the prefixes are read from ``checkout`` via
            ``get_shas``.

    Returns:
        A set of the zero-padded, lowercase 4-digit hex strings
        (``'0000'`` to ``'ffff'``) that do not appear in ``shas``.
    """
    if shas is None:
        shas = get_shas(checkout)
    # ``range`` works on both Python 2 and 3 (``xrange`` is Python 2 only).
    all_possibles = {'%04x' % possible for possible in range(16**4)}
    # ``difference`` accepts any iterable, not just a set.
    return all_possibles.difference(shas)
def get_parents_for_commit(owner, repo, commit):
    """Yield the parent hashes of a commit, as reported by the GitHub API.

    We can't use git as it won't show us hidden commits, but we have the
    full hash so we can use the API and avoid hitting the frontend.

    Args:
        owner: Repository owner.
        repo: Repository name.
        commit: Commit hash to look up.

    Yields:
        The ``sha`` of each entry in the response's ``parents`` list.

    Raises:
        requests.HTTPError: if the API answers with an error status
            (raised on first iteration, since this is a generator).
    """
    url = "https://api.github.com/repos/%s/%s/commits/%s" % (owner, repo, commit)
    # A timeout keeps a stalled connection from hanging the scan forever.
    response = requests.get(url, timeout=30)
    # Surface API errors (e.g. rate limiting) as an HTTP error rather than
    # an opaque KeyError on the missing 'parents' key.
    response.raise_for_status()
    for parent in response.json()['parents']:
        yield parent['sha']
def get_all_ancestors(owner, repo, commit, break_on=None):
    """Given a hash, iterate over all of its ancestors, with an optional
    set of commits we have seen already to ignore.

    The walk is depth-first and follows every parent, so both sides of a
    merge are visited. It uses an explicit stack instead of recursion so a
    long history cannot exceed Python's recursion limit.

    Args:
        owner: Repository owner.
        repo: Repository name.
        commit: Full hash of the commit whose ancestors are wanted.
        break_on: Optional set of 4-character prefixes; a parent whose
            prefix is in the set is neither yielded nor followed. The set
            is consulted lazily, so entries the caller adds while
            iterating take effect immediately. It is not modified here.

    Yields:
        The full hash of each ancestor, each at most once.
    """
    if break_on is None:
        break_on = set()
    # Full hashes already yielded by this walk, so a commit reachable via
    # several merge paths is not reported or re-fetched repeatedly.
    seen = set()
    # Each stack entry is the (lazy) parent iterator of a commit on the
    # current depth-first path.
    stack = [iter(get_parents_for_commit(owner, repo, commit))]
    while stack:
        for parent in stack[-1]:
            # We've seen this commit before, stop processing this branch
            if parent[:4] in break_on or parent in seen:
                continue
            seen.add(parent)
            yield parent
            # Descend into this parent before visiting its siblings.
            stack.append(iter(get_parents_for_commit(owner, repo, parent)))
            break
        else:
            # Parent iterator exhausted: go back up one level.
            stack.pop()
def check_candidate(owner, repo, candidate, known_shas):
    """Given repo information and a 4 digit prefix, try and find that commit,
    then iterate over its ancestors if it exists. Otherwise yield nothing.

    Args:
        owner: Repository owner.
        repo: Repository name.
        candidate: Hexadecimal commit prefix to probe on github.com.
        known_shas: Mutable set of 4-character prefixes already accounted
            for. The prefix of every commit yielded here is added to it.

    Yields:
        The full hash of the commit the prefix resolved to, followed by
        the full hashes of its ancestors up to the first known commit.
    """
    url = 'https://github.com/%s/%s/commit/%s' % (owner, repo, candidate)
    # A timeout keeps a stalled connection from hanging the scan forever.
    response = requests.get(url, timeout=30)
    if response.status_code != 200:
        return
    # This is a real commit, find the full hash. Match exactly 40 hex
    # digits so a trailing suffix in a link (presumably e.g. ".diff" or
    # ".patch") cannot end up inside the captured hash.
    pattern = r'/commit/(%s[0-9a-f]{%d})(?![0-9a-f])' % (
        re.escape(candidate), max(0, 40 - len(candidate)))
    match = re.search(pattern, response.text)
    if match is None:
        # The page did not contain a recognisable full hash; treat the
        # candidate as not found instead of failing with an IndexError.
        return
    full_hash = match.group(1)
    # Make a note of the prefix, so we don't re-scan this tree
    known_shas.add(full_hash[:4])
    yield full_hash
    # Yield from all ancestors of this commit, until we hit a known commit
    for ancestor in get_all_ancestors(owner, repo, full_hash, known_shas):
        known_shas.add(ancestor[:4])
        yield ancestor
def main():
    """Command-line entry point: report commits GitHub still serves but
    that are not present in a fresh clone of the repository.

    Takes two positional arguments, the repository owner and name, clones
    the repository, and probes every 4-digit hex prefix that is absent
    from the clone. Findings and progress are printed to standard output.
    """
    import shutil

    parser = argparse.ArgumentParser(description='Find unreachable commits')
    parser.add_argument('owner', type=str,
                        help='the repository owner')
    parser.add_argument('repo', type=str,
                        help='the repository name')
    args = parser.parse_args()

    # Get the git repo and find the candidates we will check for
    original_cwd = os.getcwd()
    repository = get_repo(args.owner, args.repo)
    try:
        print("Extracting known identifiers")
        known_shas = get_shas(repository)
        # Sorted so the scan order (and the progress output) is deterministic.
        potential_commits = sorted(potential_hashes(repository, shas=known_shas))
    finally:
        # The clone is only needed to collect the known prefixes. Leave the
        # checkout directory first in case the working directory was moved
        # into it, then remove it.
        os.chdir(original_cwd)
        shutil.rmtree(repository, ignore_errors=True)
    total = len(potential_commits)
    print("There are %d potential commits to check for" % (total))

    # Keep track of the last completion percentage, so we can report to the user during big scans
    percentage_complete = 0
    for i, candidate in enumerate(potential_commits):
        current_percentage = round(((i + 1.0) / total) * 100)
        # Prefixes recorded while walking an earlier hit are already
        # accounted for; probing them again would only repeat the report.
        if candidate not in known_shas:
            # Loop over ancestors for a 4 digit hash. If the hash doesn't exist, no ancestors are returned
            for parent in check_candidate(args.owner, args.repo, candidate, known_shas):
                print("Found hidden commit https://github.com/%s/%s/commit/%s" % (args.owner, args.repo, parent))
        if current_percentage > percentage_complete:
            print("%d%% complete" % (int(current_percentage)))
            percentage_complete = current_percentage
# Run the scan only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment