Skip to content

Instantly share code, notes, and snippets.

Created June 9, 2011 17:41
Show Gist options
  • Save chandlerprall/1017266 to your computer and use it in GitHub Desktop.
Save chandlerprall/1017266 to your computer and use it in GitHub Desktop.
Small Python multi-threaded file downloader
import urllib2
import threading
from Queue import Queue
import sys, os, re
class ThreadedDownload(object):
    """Download a list of URLs concurrently using a pool of worker threads.

    Construct it with the URLs and a destination folder, then call ``run()``.
    When ``run()`` returns, ``report['success']`` and ``report['failure']``
    hold the ``URLTarget`` objects that were, or were not, downloaded.
    """

    REGEX = {
        # Used when directory_structure is True. The leading ".*" is greedy,
        # so this removes everything up to and including the "/" that follows
        # the LAST dotted segment of the URL (normally the hostname, but a
        # dotted directory name such as "v1.2/" is stripped as well).
        'hostname_strip': re.compile(r'.*\..*?/', re.I),
    }

    class MissingDirectoryException(Exception):
        """Raised when the requested destination folder does not exist."""
        pass

    class Downloader(threading.Thread):
        """Worker thread that processes ``URLTarget`` objects from a queue."""

        def __init__(self, queue, report):
            """Store the shared work queue and the shared report dict.

            queue  -- queue of URLTarget objects shared by all workers
            report -- dict with 'success' and 'failure' lists to append to
            """
            threading.Thread.__init__(self)
            self.queue = queue
            self.report = report

        def run(self):
            """Download queued targets until the queue is drained.

            A failed target is put back on the queue until it has been tried
            ``url_tries`` times; after that it is recorded as a failure.
            """
            # Local import: the file only imports the Queue class itself, and
            # the Empty exception lives in the same (Python 2) module.
            from Queue import Empty

            while True:
                # get_nowait() instead of an empty()/get() pair: with several
                # workers another thread may take the last item between the
                # two calls, which would leave this thread blocked forever.
                try:
                    target = self.queue.get_nowait()
                except Empty:
                    break

                try:
                    if target.download():
                        self.report['success'].append(target)
                    elif target.url_tried < target.url_tries:
                        # Retry later; re-queued before task_done() so the
                        # queue never looks finished while work remains.
                        self.queue.put(target)
                    else:
                        self.report['failure'].append(target)
                finally:
                    self.queue.task_done()

    class URLTarget(object):
        """A single URL together with its local destination and retry state."""

        # Number of bytes copied per read while streaming a download to disk.
        CHUNK_SIZE = 64 * 1024

        def __init__(self, url, destination, url_tries):
            """Remember what to fetch, where to store it and how often to try.

            url         -- remote address to download
            destination -- local file path the content is written to
            url_tries   -- maximum number of download attempts
            """
            self.url = url
            self.destination = destination
            self.url_tries = url_tries
            self.url_tried = 0
            self.success = False
            self.error = None

        def download(self):
            """Attempt the download once and return True on success.

            An existing destination file counts as already downloaded. Any
            exception is stored in ``self.error`` rather than raised, so the
            caller can decide whether to retry.
            """
            self.url_tried = self.url_tried + 1
            partial_path = None

            try:
                if os.path.exists(self.destination):
                    # This file has already been downloaded
                    self.success = True
                    return self.success

                # Stream into a per-target temporary file and rename it at the
                # end, so an interrupted transfer never leaves a truncated
                # file that a retry would mistake for a finished download.
                partial_path = '%s.%x.part' % (self.destination, id(self))

                remote_file = urllib2.urlopen(self.url)
                try:
                    directory = os.path.dirname(self.destination)
                    if directory and not os.path.isdir(directory):
                        try:
                            os.makedirs(directory)
                        except OSError:
                            # Another worker may have created it meanwhile.
                            if not os.path.isdir(directory):
                                raise

                    dest_file = open(partial_path, 'wb')
                    try:
                        while True:
                            chunk = remote_file.read(self.CHUNK_SIZE)
                            if not chunk:
                                break
                            dest_file.write(chunk)
                    finally:
                        dest_file.close()
                finally:
                    remote_file.close()

                os.rename(partial_path, self.destination)
                self.success = True

            except Exception as e:
                self.error = e
                # Best-effort removal of the incomplete temporary file.
                if partial_path is not None and os.path.exists(partial_path):
                    try:
                        os.remove(partial_path)
                    except OSError:
                        pass

            return self.success

        def __str__(self):
            return 'URLTarget (%(url)s, %(success)s, %(error)s)' % {'url': self.url, 'success': self.success, 'error': self.error}

    def __init__(self, urls=(), destination='.', directory_structure=False, thread_count=5, url_tries=3):
        """Prepare the work queue and the result report.

        urls                -- iterable of URLs to download
        destination         -- existing folder that receives the files
        directory_structure -- False: file names only; True: keep the URL path
                               below the hostname; (regex, replacement) pair:
                               custom mapping from URL to relative path
        thread_count        -- number of worker threads started by run()
        url_tries           -- maximum attempts per URL

        Raises MissingDirectoryException if ``destination`` does not exist.
        """
        if not os.path.exists(destination):
            raise ThreadedDownload.MissingDirectoryException('Destination folder does not exist.')

        self.queue = Queue(0)  # Infinite sized queue
        self.report = {'success': [], 'failure': []}
        self.threads = []

        if not destination.endswith(os.path.sep):
            destination = destination + os.path.sep
        self.destination = destination
        self.thread_count = thread_count
        self.directory_structure = directory_structure

        # Prepopulate queue with any values we were given
        for url in urls:
            self.addTarget(url, url_tries)

    def fileDestination(self, url):
        """Return the local path for ``url``, or None if it cannot be derived.

        The result depends on ``self.directory_structure`` (see __init__).
        Forward slashes in the result are converted to the OS separator.
        """
        structure = self.directory_structure

        # Equality rather than truthiness on purpose: values such as None or
        # an empty tuple must fall through to the branches below instead of
        # being treated like False.
        if structure == False:
            # No directory structure, just filenames
            relative = os.path.basename(url)
        elif structure == True:
            # Strip off hostname, keep all other directories
            relative = ThreadedDownload.REGEX['hostname_strip'].sub('', url)
        elif hasattr(structure, '__len__') and len(structure) == 2:
            # User supplied a custom regex replace
            regex = structure[0]
            replace = structure[1]
            if not hasattr(regex, 'sub'):
                # A plain pattern string was given; compile it first.
                regex = re.compile(regex)
            relative = regex.sub(replace, url)
        else:
            # No idea what's wanted
            return None

        file_destination = '%s%s' % (self.destination, relative)
        return file_destination.replace('/', os.path.sep)

    def addTarget(self, url, url_tries=3):
        """Queue one more URL for download, tried at most ``url_tries`` times."""
        self.queue.put(ThreadedDownload.URLTarget(url, self.fileDestination(url), url_tries))

    def run(self):
        """Start ``thread_count`` workers and block until all have finished."""
        workers = []
        for _ in range(self.thread_count):
            worker = ThreadedDownload.Downloader(self.queue, self.report)
            worker.start()
            workers.append(worker)
        self.threads.extend(workers)

        # Workers exit once the queue is drained, so joining them waits for
        # every queued target, including re-queued retries.
        for worker in workers:
            worker.join()
if __name__ == "__main__":
    # Command line: URL_LIST_FILE [DESTINATION_DIR [THREAD_COUNT]]
    # URL_LIST_FILE holds one URL per line.
    if len(sys.argv) == 1:
        print('No source URLs given.')
        sys.exit(1)

    url_source_path = sys.argv[1]
    if not os.path.exists(url_source_path):
        print('`%s` not found.' % url_source_path)
        sys.exit(1)

    # Load urls, ignoring blank lines so they are not queued as empty URLs
    with open(url_source_path, 'r') as url_source:
        urls = [line.strip() for line in url_source if line.strip()]

    # Download destination
    if len(sys.argv) >= 3:
        destination = sys.argv[2]
        if not os.path.exists(destination):
            print('Destination `%s` does not exist.' % destination)
            sys.exit(1)
    else:
        destination = '.'

    # Number of threads
    if len(sys.argv) >= 4:
        try:
            threads = int(sys.argv[3])
        except ValueError:
            print('Thread count `%s` is not a number.' % sys.argv[3])
            sys.exit(1)
    else:
        threads = 5

    downloader = ThreadedDownload(urls, destination, True, threads, 3)

    print('Downloading %s files' % len(urls))
    downloader.run()
    print('Downloaded %(success)s of %(total)s' % {'success': len(downloader.report['success']), 'total': len(urls)})

    if len(downloader.report['failure']) > 0:
        print('\nFailed urls:')
        for url in downloader.report['failure']:
            print(url)
Copy link

Well done, man. Will this work on App Engine? I need a PHP version of this script to make a few free PHP scripts for my free PHP script download site.

Copy link

This only works for files which fit into memory.

Copy link

Is it possible to use multiple proxies, one for each thread?

Copy link

alik604 commented Nov 18, 2019

For files that don't fit into memory.

from urllib.request import urlopen
response = urlopen(url)
with open('./myFile.pdf', 'wb') as f:
        while True:
            chunk = response.read(16 * 1024)
            if not chunk:
                break
            f.write(chunk)

Copy link

How do you use it?

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment