#!/usr/bin/env python3.6
# -*- coding: utf-8 -*-
# Download a range of URLs in parallel.
# https://gist.github.com/kylemcdonald/3cbd09752e340849e4b3cb4f12dd8c85
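#
# Usage sketch (the script name "download-urls.py" is illustrative):
#   ./download-urls.py -u 'http://example.com/img/[1-100].jpg'
#   ./download-urls.py -l urls.txt -n 16
# Bracketed ranges like [1-100] in -u are expanded; quote the URL so the
# shell does not interpret the brackets.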
from multiprocessing.dummy import Pool
from tqdm import tqdm
from urllib.parse import urlsplit
import urllib3
import itertools, functools, operator
import os
import sys
import re
import errno
import argparse
parser = argparse.ArgumentParser(description='Download a range or list of urls in parallel from a single domain.')
parser.add_argument('-n', '--n_connections', default=32, type=int, help='Number of parallel connections.')
parser.add_argument('-d', '--dry_run', action='store_true', help='Show the first few URLs to download.')
parser.add_argument('-z', '--zeropad', action='store_true', help='Pad zeros with maximum amount needed.')
parser.add_argument('-x', '--extension', default=None, type=str, help="Extension code, e.g. 'jpg'.")
group = parser.add_mutually_exclusive_group(required=True)
group.add_argument('-l', '--list', default=sys.stdin, type=argparse.FileType('r'), help='File with a list of urls.')
group.add_argument('-u', '--urls', default='', type=str, help='URL with range(s) of numbers.')
args = parser.parse_args()
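# Render numeric ids as strings; with --zeropad, pad each id to the width of
# the range's upper bound, e.g. [1-100] yields '001', '002', ..., '100'.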
def ids_to_str(ids, stop):
    if args.zeropad:
        max_size = len(stop)
        return map(lambda e: str(e).zfill(max_size), ids)
    else:
        return map(str, ids)
domain = ''
urls = []
ids = []
job_count = 0
scheme = 'http'
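# -u mode: expand each bracketed [start-stop] range in the URL into every
# value it covers, taking the cartesian product when several ranges appear.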
if args.urls:
    parsed = urlsplit(args.urls)
    scheme = parsed.scheme
    domain = parsed.netloc
    path = parsed.path + ('?' + parsed.query if parsed.query else '')
    pieces = re.split(r'\[(.+?)\]', path)
    url_pieces = []
    id_pieces = []
    for i, piece in enumerate(pieces):
        if i % 2 == 0:
            url_pieces.append([piece])
        else:
            start, stop = piece.split('-')
            cur_id_pieces = range(int(start), int(stop) + 1)  # inclusive of end
            cur_url_pieces = ids_to_str(cur_id_pieces, stop)
            id_pieces.append(cur_id_pieces)
            url_pieces.append(cur_url_pieces)
    urls = map(lambda x: ''.join(x), itertools.product(*url_pieces))
    ids = itertools.product(*id_pieces)
    job_count = functools.reduce(operator.mul, map(len, id_pieces), 1)
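# -l mode: read full URLs (one per line) from a file or stdin; the path
# segments double as ids, so the output mirrors the URL's directory layout.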
else:
    full_urls = args.list.read().splitlines()
    for url in full_urls:
        parsed = urlsplit(url)
        path = parsed.path + ('?' + parsed.query if parsed.query else '')
        pieces = [e for e in path.split('/') if e != '']
        urls.append(path)
        ids.append(pieces)
        scheme = parsed.scheme
        domain = parsed.netloc
    job_count = len(urls)
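# A single urllib3 pool for the one target domain is shared by all worker
# threads; for https, certificate verification is deliberately disabled.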
if scheme == 'http':
    connection_pool = urllib3.HTTPConnectionPool(domain)
elif scheme == 'https':
    urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
    connection_pool = urllib3.HTTPSConnectionPool(domain, cert_reqs='CERT_NONE')
else:
    print(f'Scheme not recognized: {scheme}')
    sys.exit(1)
jobs = zip(urls, ids)
print('Accessing', job_count, 'URLs at', domain, 'across', args.n_connections, 'connections.')
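# Equivalent of `mkdir -p`: create intermediate directories, tolerating the
# race where another thread creates the same directory first.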
def mkdir_p(path):
    try:
        os.makedirs(path)
    except OSError as exc:
        if exc.errno == errno.EEXIST and os.path.isdir(path):
            pass
        else:
            raise
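# Fetch one path over the shared pool and write it to fn. Existing files are
# skipped, so an interrupted run can be resumed by re-running the command.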
def download(url, fn):
    if not os.path.isfile(fn):
        r = connection_pool.urlopen('GET', url)
        if r.status == 200:
            # Only create the file on success; writing it unconditionally
            # would leave empty files behind for failed requests.
            with open(fn, 'wb') as f:
                f.write(r.data)
        elif r.status != 404:
            print('Error {} requesting {}'.format(r.status, url))
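# Map a job's ids onto disk: domain/<id0>/<id1>/.../<last id>[.extension],
# so each range (or path segment) becomes one directory level.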
def build_filename(ids):
    ids = list(map(str, ids))
    dir_name = os.path.join(domain, *ids[:-1])
    fn = os.path.join(dir_name, ids[-1] + ('.' + args.extension if args.extension else ''))
    return dir_name, fn
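# --dry_run previews the first few URL-to-file mappings without downloading.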
if args.dry_run:
    max_display = 10
    for url, ids in itertools.islice(jobs, max_display):
        dir_name, fn = build_filename(ids)
        print(f'{domain}{url} => {fn}')
    if job_count > max_display:
        print('... and {} more'.format(job_count - max_display))
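# Real run: multiprocessing.dummy provides a thread pool, which fits this
# I/O-bound workload; tqdm tracks progress as jobs complete.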
else:
    pbar = tqdm(total=job_count, leave=True)
    def job(cur_job):
        url, ids = cur_job
        dir_name, fn = build_filename(ids)
        mkdir_p(dir_name)
        download(url, fn)
        pbar.update(1)
    pool = Pool(args.n_connections)
    pool.map(job, jobs)