@orangeblock
Last active December 23, 2018 16:15
Customizable concurrent URL downloader. Works with Python 2.7.x and 3.x using only the standard library. Can be run as a command-line script or used as a library: simply pass a list of URLs to download_files. For the rest, read the docstring or run the script with --help; a usage sketch follows the script below.
#!/usr/bin/python
import os
import sys
import csv
import time
import shutil
from multiprocessing.dummy import Pool as ThreadPool
try:
    from collections.abc import Iterable  # Python 3.3+ (required on 3.10+)
except ImportError:
    from collections import Iterable      # Python 2
try:
    # Python 2
    from urllib2 import urlopen
    from urlparse import urlparse
    from urllib import unquote
except ImportError:
    # Python 3
    from urllib.parse import unquote, urlparse
    from urllib.request import urlopen
try:
    basestring
except NameError:
    # Python 3 has no basestring; treat str as the string base type.
    basestring = str

def _download_files(tups, dest, concurrency, chunk_size, verbose, _timeout):
    def worker(url, filename=None):
        if filename is None:
            filename = os.path.basename(urlparse(unquote(url)).path)
            if not filename:
                raise ValueError("Could not extract filename from URL")
        req = urlopen(url)
        path = os.path.join(dest, filename)
        with open(path, 'wb') as f:
            shutil.copyfileobj(req, f, chunk_size)
        if verbose:
            sys.stdout.write('[+] %s -> %s\n' % (url, filename))
        return filename, os.path.abspath(path)

    pool = ThreadPool(concurrency)
    result = pool.map_async(lambda args: worker(*args), tups)
    # async allows interrupts to be triggered
    while not result.ready():
        time.sleep(_timeout)
    return result.get()

def download_files(xs, dest='.', concurrency=10, chunk_size=128*1024, verbose=False, _timeout=.1):
    """
    Spawn a pool of workers that will download the given urls.

    `xs`: List of strings or a list of tuples in the format (url, [filename]).
        Strings have to be the urls to download. In the tuple format you can
        also specify the filename for the saved file. If None, or missing, it
        is extracted from the URL.
    `dest`: The directory to save the downloaded files into. Can be relative or absolute.
    `concurrency`: The number of threads to run in parallel.
    `chunk_size`: The max size of the request chunk, in bytes.
    `verbose`: If True, logs additional data.
    `_timeout`: The amount to sleep between checks for completion.

    Returns a list of tuples in the format (filename, full_path_to_file).
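
    Example (illustrative only; these URLs and the target directory are placeholders):

        download_files([
            'https://example.com/files/report.pdf',
            ('https://example.com/files/data.bin', 'data-copy.bin'),
        ], dest='downloads', concurrency=4)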
"""
if type(xs) != list:
raise ValueError('Input must be a list')
tups = []
for x in xs:
if isinstance(x, basestring):
tups.append((x, None))
elif isinstance(x, Iterable):
if len(x) not in [1,2]:
raise ValueError('Invalid tuple size: %s' % len(x))
tups.append((x[0], x[1] if len(x) == 2 else None))
return _download_files(tups, dest, concurrency, chunk_size, verbose, _timeout)

if __name__ == '__main__':
    import argparse

    def _valid_dir(maybe_dir):
        if not os.path.isdir(maybe_dir):
            raise argparse.ArgumentTypeError('%s is not a valid directory' % maybe_dir)
        if not os.access(maybe_dir, os.W_OK):
            raise argparse.ArgumentTypeError('Cannot write to %s' % maybe_dir)
        return maybe_dir

    parser = argparse.ArgumentParser()
    parser.add_argument('urls', type=argparse.FileType('r'),
                        help='File with a url on each line, optionally with a target filename separated by a comma')
    parser.add_argument('dest', nargs='?', default=os.getcwd(), type=_valid_dir,
                        help='Destination directory to download the files into (defaults to current)')
    parser.add_argument('-t', '--threads', type=int, default=10,
                        help='Number of parallel threads to run (default=%(default)s)')
    parser.add_argument('-c', '--chunk-size', type=int, default=128*1024,
                        help='Size of chunk to download at a time, in bytes (default=%(default)s)')
    args = parser.parse_args()

    download_files(list(csv.reader(args.urls)), args.dest, args.threads, args.chunk_size, verbose=True)
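
A minimal library-usage sketch, not part of the gist itself: it assumes the script above is saved as downloader.py on the import path, and the URLs and the downloads directory are placeholder names. Note that the destination directory must already exist, since the downloader only opens files inside it.

# Usage sketch only; downloader.py, the URLs and 'downloads' are illustrative assumptions.
import os
from downloader import download_files

if not os.path.isdir('downloads'):
    os.makedirs('downloads')  # destination directory must exist before downloading

results = download_files(
    [
        'https://example.com/a.txt',                     # filename taken from the URL path
        ('https://example.com/b.txt', 'renamed-b.txt'),  # explicit target filename
    ],
    dest='downloads',
    concurrency=4,
    verbose=True,
)
for filename, path in results:
    print('%s -> %s' % (filename, path))

# Equivalent command-line use (urls.txt holds one URL per line, optionally
# followed by a comma and a target filename):
#   python downloader.py urls.txt downloads -t 4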