@vijaiaeroastro, forked from humbhenri/get-images.py, created November 13, 2015
Download images in parallel using Python
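Run it against a page, for example: python get-images.py -w http://example.com. Every img tag found on the page is downloaded into a folder named after the site's hostname.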
from BeautifulSoup import BeautifulSoup
from functools import partial
from urllib import urlretrieve
from urlparse import urljoin, urlparse
import argparse
import multiprocessing
import os
import sys
import urllib2


def get_url():
    # Parse the -w command line flag and return the target website URL.
    parser = argparse.ArgumentParser(description='Download images from a website')
    parser.add_argument('-w', type=str, help='website url')
    options = parser.parse_args()
    if options.w is None:
        parser.print_help()
        sys.exit()
    return options.w


def save_image(image, site_url, folder):
    # Resolve the <img> src against the page URL and download it into folder.
    filename = image['src'].split('/')[-1]
    outpath = os.path.join(folder, filename)
    url = urljoin(site_url, image['src'])
    urlretrieve(url, outpath)
    print 'Saved image ' + outpath


if __name__ == '__main__':
    url = get_url()
    # Save the images into a directory named after the site's hostname.
    folder = urlparse(url).netloc
    if not os.path.isdir(folder):
        os.mkdir(folder)
    page = BeautifulSoup(urllib2.urlopen(url).read())
    images = page.findAll('img')
    # Download the images in parallel, one worker process per CPU core.
    pool = multiprocessing.Pool()
    partial_save_image = partial(save_image, site_url=url, folder=folder)
    pool.map(partial_save_image, images)
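The script above is Python 2 only (print statements, urllib2, urlparse, and the old BeautifulSoup 3 import). A minimal Python 3 sketch of the same approach, assuming the beautifulsoup4 package is installed and taking the URL as a positional argument instead of the -w flag, might look like this:

from bs4 import BeautifulSoup
from functools import partial
from urllib.parse import urljoin, urlparse
from urllib.request import urlopen, urlretrieve
import multiprocessing
import os
import sys


def save_image(src, site_url, folder):
    # Resolve the image URL against the page URL and download it into folder.
    outpath = os.path.join(folder, src.split('/')[-1])
    urlretrieve(urljoin(site_url, src), outpath)
    print('Saved image ' + outpath)


if __name__ == '__main__':
    if len(sys.argv) != 2:
        sys.exit('usage: get-images.py <website url>')
    url = sys.argv[1]
    folder = urlparse(url).netloc
    os.makedirs(folder, exist_ok=True)
    soup = BeautifulSoup(urlopen(url).read(), 'html.parser')
    # Hand plain src strings to the workers instead of parsed tags,
    # so only simple strings are pickled across process boundaries.
    srcs = [img['src'] for img in soup.find_all('img') if img.get('src')]
    with multiprocessing.Pool() as pool:
        pool.map(partial(save_image, site_url=url, folder=folder), srcs)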