@firxworx
Created February 20, 2018 23:53
Web scraping with Python: download all files linked from a given web page with BeautifulSoup, urllib, and shutil
import urllib.request
import urllib.parse
import shutil
import re
from pathlib import Path
from bs4 import BeautifulSoup
# target page containing links to the image files
target_page = 'http://example.ca/image_links.php'
# local path
dest_path = '/Volumes/ArrayRAID/scraped/images'
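# Assumption (not in the original script): dest_path must already exist before files
# are written. If it may not, it can be created up front with:
#   Path(dest_path).mkdir(parents=True, exist_ok=True)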
# NOTE: this implementation (easily modified) assumes link hrefs contain absolute
# URLs with an 'http://' protocol prefix, e.g. http://example.com/dir/image.jpg, and that
# all links on the target_page point to desired image files.
img_urls = []
page = urllib.request.urlopen(target_page).read()
soup = BeautifulSoup(page, 'html.parser')
for link in soup.find_all('a', attrs={'href': re.compile("^http://")}):
    img_urls.append(link.get('href'))
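# Sketch (not part of the original script): if the page also uses relative hrefs,
# one way to handle them is to resolve every href against target_page with
# urllib.parse.urljoin instead of matching only absolute http:// links:
#
#   for link in soup.find_all('a', href=True):
#       img_urls.append(urllib.parse.urljoin(target_page, link['href']))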
counter = 1
for img_url in img_urls:
    img_filename = Path(img_url).name
    img_dest = dest_path + '/' + img_filename
    # recreate url with a url-encoded img_filename to handle whitespace in filenames
    img_url_clean = img_url.rsplit('/', 1)[0] + '/' + urllib.parse.quote(img_filename)
    print(str(counter) + ":\t " + img_dest)
    counter += 1
    with urllib.request.urlopen(img_url_clean) as response, open(img_dest, 'wb') as out_file:
        shutil.copyfileobj(response, out_file)
    #if counter > 4:
    #    break
print("DONE!")
print("Saved " + str(counter - 1) + " files.")