@keithstellyes
Created October 8, 2019 19:47
A script for downloading just a directory off of GitHub
#!/usr/bin/env python3
import getpass, json, os, shutil, sys
import requests

def ensure_dir(path):
    try:
        os.mkdir(path)
    except FileExistsError:
        # It's OK if the directory already exists; having an "ensure"
        # function just keeps the calling code cleaner.
        pass
def gh_ls(url, auth=None):
    '''
    Returns a dict of the files and directories at an api.github.com/repos/
    contents URL, keyed by entry type. On failure it returns a
    (status, message) tuple instead.
    '''
    if not url.startswith('https://api.github.com/'):
        return (-1, 'Invalid repo API URL; it should look like, for example: '
                'https://api.github.com/repos/qt/qtbase/contents/examples '
                'for the examples/ dir on the repo qt/qtbase, whose normal '
                'URL is https://github.com/qt/qtbase/tree/dev/examples')
    # Ideally we would have some retry logic here, but it would need to be
    # smart enough not to keep pounding GitHub once the API rate limit is
    # exceeded.
    r = requests.get(url, auth=auth)
    if r.status_code != 200:
        return r.status_code, r.text
    resp = {}
    files_dirs = json.loads(r.text)
    for fd in files_dirs:
        # Delete keys we don't care about to reduce noise.
        del fd['url']
        del fd['git_url']
        del fd['sha']
        del fd['_links']
        resp.setdefault(fd['type'], []).append(fd)
    return resp
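# For reference, a successful gh_ls() call returns something shaped roughly
# like the (abridged, illustrative) dict below; the field names come from the
# GitHub contents API, and only the keys this script actually uses are shown:
#
#   {'file': [{'name': 'README.md', 'type': 'file',
#              'download_url': 'https://raw.githubusercontent.com/...'}],
#    'dir':  [{'name': 'widgets', 'type': 'dir', 'download_url': None}]}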
if __name__ == '__main__':
    url = sys.argv[1]
    if len(sys.argv) > 2:
        auth = (sys.argv[2], sys.argv[3])
    else:
        auth = (input('GitHub username: '), getpass.getpass())
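    # Note: this is an assumption about why the script always asks for
    # credentials -- GitHub grants authenticated API requests a much higher
    # rate limit than anonymous ones, which matters when walking a large
    # directory tree.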
    q = [(url, os.curdir)]
    while q:
        url, curdir = q.pop()
        fd = gh_ls(url, auth)
        if not isinstance(fd, dict):
            print('Error downloading! Assuming I should exit... Details:', str(fd))
            sys.exit(1)
        entries = []
        for f in fd.get('file', []):
            entries.append(curdir + '/' + str(f['name']))
        for d in fd.get('dir', []):
            entries.append(curdir + '/' + d['name'] + '/')
        print('\n'.join(sorted(entries)))
        for d in fd.get('dir', []):
            ensure_dir(curdir + '/' + d['name'])
            q.append((url + '/' + d['name'], curdir + '/' + d['name']))
        for f in fd.get('file', []):
            fpath = curdir + '/' + f['name']
            print('Trying to download:', fpath)
            if os.path.exists(fpath):
                print(fpath, 'already exists! Skipping...')
                continue
            if f['download_url'] is None:
                print('{}/{} did not have a download url as expected!'.format(
                    curdir, f['name']))
                continue
            dl_url = f['download_url']
            # Requests is a little ugly for downloading non-text data, TBH:
            # stream=True is necessary for downloading binary data.
            r = requests.get(dl_url, stream=True)
            if r.status_code != 200:
                # Exit here, since if the download fails things are presumed
                # broken beyond our control.
                print('Failed to download', dl_url)
                sys.exit(1)
            # GitHub likes sending responses gzip-compressed, it seems, so we
            # need requests to decompress them.
            r.raw.decode_content = True
            with open(fpath, 'wb') as out:
                shutil.copyfileobj(r.raw, out)
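Usage sketch (the filename gh_dir_download.py is just an assumed name for this script, not anything the gist specifies; the URL is the contents-API form from the error message above):

python3 gh_dir_download.py https://api.github.com/repos/qt/qtbase/contents/examples

With only the URL argument, the script prompts for a GitHub username and password; they can also be passed as the second and third command-line arguments.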