A script for downloading just a directory off of GitHub
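Usage mirrors the script's sys.argv handling: pass the API contents URL, optionally followed by a GitHub username and password (it prompts for both if they are omitted). A sketch, assuming the gist is saved as gh_dl.py (the filename is an assumption, not part of the gist):

python3 gh_dl.py https://api.github.com/repos/qt/qtbase/contents/examples [username] [password]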
#!/usr/bin/env python3
import shutil, os, sys, json, requests, getpass

def ensure_dir(path):
    try:
        os.mkdir(path)
    except FileExistsError:
        # It's OK if the directory already exists; wrapping mkdir in an
        # "ensure" function just keeps the calling code cleaner.
        pass
def gh_ls(url, auth=None):
    '''
    Lists the files and directories at an api.github.com/repos/ contents URL.
    Returns a dict keyed by entry type ('file', 'dir', ...) on success, or a
    (status_code, message) tuple on failure.
    '''
    # Note: the original used url.lstrip('https://'), but str.lstrip strips a
    # *set of characters*, not a prefix, so test the prefix directly instead.
    if url.startswith(('https://github.com', 'http://github.com', 'github.com')):
        return (-1, 'Invalid Repo API URL; for example it should look like '
                'https://api.github.com/repos/qt/qtbase/contents/examples '
                'for the examples/ dir of the repo qt/qtbase, whose normal '
                'URL is https://github.com/qt/qtbase/tree/dev/examples')
    # Ideally we would have some retry logic here, but it needs to be smart
    # enough not to keep pounding GitHub once the API rate limit is exceeded.
    r = requests.get(url, auth=auth)
    if r.status_code != 200:
        return r.status_code, r.text
    resp = {}
    files_dirs = json.loads(r.text)
    for fd in files_dirs:
        # Delete keys we don't care about to reduce noise
        del fd['url']
        del fd['git_url']
        del fd['sha']
        del fd['_links']
        if fd['type'] not in resp:
            resp[fd['type']] = []
        resp[fd['type']].append(fd)
    return resp
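# For reference, each entry returned by the contents API looks roughly like
# this (a sketch of GitHub's documented response shape; more fields exist
# than shown here):
#   {"name": "widgets", "path": "examples/widgets", "type": "dir",
#    "download_url": null, ...}
# download_url is null for directories, which is why the download loop below
# checks it before fetching a file.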
if __name__ == '__main__':
    url = sys.argv[1]
    auth = None
    if len(sys.argv) > 2:
        auth = (sys.argv[2], sys.argv[3])
    else:
        # Prompt interactively; a personal access token also works in place
        # of a password here.
        auth = (input('GitHub username: '), getpass.getpass())
    q = [(url, os.curdir)]
    while q:
        url, curdir = q.pop()
        fd = gh_ls(url, auth)
        entries = []
        if not isinstance(fd, dict):
            print('Error downloading! Assuming I should exit... Details:', str(fd))
            sys.exit(1)
        if 'file' in fd:
            for f in fd['file']:
                entries.append(curdir + '/' + str(f['name']))
        if 'dir' in fd:
            for d in fd['dir']:
                entries.append(curdir + '/' + d['name'] + '/')
        print('\n'.join(sorted(entries)))
        if 'dir' in fd:
            for d in fd['dir']:
                ensure_dir(curdir + '/' + d['name'])
                q.append((url + '/' + d['name'], curdir + '/' + d['name']))
        if 'file' not in fd:
            continue
        for f in fd['file']:
            fpath = curdir + '/' + f['name']
            print('Trying to download:', fpath)
            if os.path.exists(fpath):
                print(fpath, 'already exists! Skipping...')
                continue
            if f['download_url'] is None:
                print('{}/{} did not have a download url as expected!'.format(
                    curdir, f['name']))
                # Nothing to fetch without a download URL, so skip this entry
                # (the original fell through and passed None to requests.get).
                continue
            dl_url = f['download_url']
            ## Requests is a little ugly for downloading non-text data, TBH ##
            # stream=True is necessary for downloading binary data
            r = requests.get(dl_url, stream=True)
            if r.status_code != 200:
                # Exit here: if the download fails, things are presumably
                # broken beyond our control.
                print('Failed to download', dl_url)
                sys.exit(1)
            # GitHub likes sending responses as zip blobs, it seems,
            # so we need requests to decompress
            r.raw.decode_content = True
            # Use a name other than f so we don't shadow the loop variable
            with open(fpath, 'wb') as out:
                shutil.copyfileobj(r.raw, out)
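gh_ls can also be reused from other scripts. A minimal sketch, assuming the gist above is saved as gh_dl.py on the import path (the module name is an assumption, not something the gist itself defines):

from gh_dl import gh_ls

# List a directory without downloading anything. Unauthenticated calls
# work, but get a much lower API rate limit than authenticated ones.
listing = gh_ls('https://api.github.com/repos/qt/qtbase/contents/examples')
if isinstance(listing, dict):
    # Success: a dict keyed by entry type, e.g. 'file' and 'dir'
    for d in listing.get('dir', []):
        print('dir: ', d['name'])
    for f in listing.get('file', []):
        print('file:', f['name'], f['download_url'])
else:
    # Failure: a (status_code, message) tuple
    status, detail = listing
    print('GitHub API error:', status, detail)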