@orangeblock
Last active December 23, 2018 16:15
Customizable concurrent URL downloader. Works with Python 2.7.x and 3.x using only the standard library. Can be run as a command-line script or used as a library: simply pass a list of URLs to download_files. For the rest, read the docstring or run the script with --help; a usage sketch follows the script below.
#!/usr/bin/python
import os
import sys
import csv
import time
import shutil
from multiprocessing.dummy import Pool as ThreadPool
try:
    from collections.abc import Iterable  # Python 3.3+ (required on 3.10+)
except ImportError:
    from collections import Iterable      # Python 2
try:
    # Python 2
    from urllib2 import urlopen
    from urlparse import urlparse
    from urllib import unquote
except ImportError:
    # Python 3
    from urllib.parse import unquote, urlparse
    from urllib.request import urlopen
try:
    basestring
except NameError:
    # Python 3 has no basestring; treat str as the string base type.
    basestring = str

def _download_files(tups, dest, concurrency, chunk_size, verbose, _timeout):
    def worker(url, filename=None):
        if filename is None:
            filename = os.path.basename(urlparse(unquote(url)).path)
            if not filename:
                raise ValueError("Could not extract filename from URL")
        req = urlopen(url)
        path = os.path.join(dest, filename)
        with open(path, 'wb') as f:
            shutil.copyfileobj(req, f, chunk_size)
        if verbose:
            sys.stdout.write('[+] %s -> %s\n' % (url, filename))
        return filename, os.path.abspath(path)

    pool = ThreadPool(concurrency)
    result = pool.map_async(lambda args: worker(*args), tups)
    # async allows interrupts to be triggered
    while not result.ready():
        time.sleep(_timeout)
    return result.get()

def download_files(xs, dest='.', concurrency=10, chunk_size=128*1024, verbose=False, _timeout=.1):
    """
    Spawn a pool of workers that will download the given urls.

    `xs`: List of strings or a list of tuples in the format (url, [filename]).
        Strings have to be the urls to download. In the tuple format you can
        also specify the filename for the saved file. If None, or missing, it
        is extracted from the URL.
    `dest`: The directory to save the downloaded files into. Can be relative or absolute.
    `concurrency`: The number of threads to run in parallel.
    `chunk_size`: The max size of the request chunk, in bytes.
    `verbose`: If True, logs additional data.
    `_timeout`: The amount to sleep between checks for completion.

    Returns a list of tuples in the format (filename, full_path_to_file).
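
    Example (illustrative only; these URLs and the target directory are placeholders):

        download_files([
            'https://example.com/files/report.pdf',
            ('https://example.com/files/data.bin', 'data-copy.bin'),
        ], dest='downloads', concurrency=4)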
"""
if type(xs) != list:
raise ValueError('Input must be a list')
tups = []
for x in xs:
if isinstance(x, basestring):
tups.append((x, None))
elif isinstance(x, Iterable):
if len(x) not in [1,2]:
raise ValueError('Invalid tuple size: %s' % len(x))
tups.append((x[0], x[1] if len(x) == 2 else None))
return _download_files(tups, dest, concurrency, chunk_size, verbose, _timeout)

if __name__ == '__main__':
    import argparse

    def _valid_dir(maybe_dir):
        if not os.path.isdir(maybe_dir):
            raise argparse.ArgumentTypeError('%s is not a valid directory' % maybe_dir)
        if not os.access(maybe_dir, os.W_OK):
            raise argparse.ArgumentTypeError('Cannot write to %s' % maybe_dir)
        return maybe_dir

    parser = argparse.ArgumentParser()
    parser.add_argument('urls', type=argparse.FileType('r'),
                        help='File with a url on each line, optionally with a target filename separated by a comma')
    parser.add_argument('dest', nargs='?', default=os.getcwd(), type=_valid_dir,
                        help='Destination directory to download the files into (defaults to current)')
    parser.add_argument('-t', '--threads', type=int, default=10,
                        help='Number of parallel threads to run (default=%(default)s)')
    parser.add_argument('-c', '--chunk-size', type=int, default=128*1024,
                        help='Size of chunk to download at a time, in bytes (default=%(default)s)')
    args = parser.parse_args()

    download_files(list(csv.reader(args.urls)), args.dest, args.threads, args.chunk_size, verbose=True)
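
A minimal library-usage sketch, not part of the gist itself: it assumes the script above is saved as downloader.py on the import path, and the URLs and the downloads directory are placeholder names. Note that the destination directory must already exist, since the downloader only opens files inside it.

# Usage sketch only; downloader.py, the URLs and 'downloads' are illustrative assumptions.
import os
from downloader import download_files

if not os.path.isdir('downloads'):
    os.makedirs('downloads')  # destination directory must exist before downloading

results = download_files(
    [
        'https://example.com/a.txt',                     # filename taken from the URL path
        ('https://example.com/b.txt', 'renamed-b.txt'),  # explicit target filename
    ],
    dest='downloads',
    concurrency=4,
    verbose=True,
)
for filename, path in results:
    print('%s -> %s' % (filename, path))

# Equivalent command-line use (urls.txt holds one URL per line, optionally
# followed by a comma and a target filename):
#   python downloader.py urls.txt downloads -t 4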