How to download all files with certain extensions from a single webpage with Python
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
from __future__ import (division, absolute_import, print_function, unicode_literals)

import sys, os, argparse, time

from bs4 import BeautifulSoup

# from: https://stackoverflow.com/a/16518224/6332373
if sys.version_info >= (3,):
    import urllib.request as urllib2
    import urllib.parse as urlparse
else:
    import urllib2
    import urlparse


def download_file(url, dest=None):
    """
    Download and save the file specified by url to the dest directory.
    """
    u = urllib2.urlopen(url)

    scheme, netloc, path, query, fragment = urlparse.urlsplit(url)
    filename = os.path.basename(path)
    if not filename:
        filename = 'downloaded.file'
    if dest:
        filename = os.path.join(dest, filename)

    with open(filename, 'wb') as f:
        meta = u.info()
        # Python 2 responses expose getheaders(); Python 3 responses expose get_all()
        meta_func = meta.getheaders if hasattr(meta, 'getheaders') else meta.get_all
        meta_length = meta_func("Content-Length")
        file_size = None
        if meta_length:
            file_size = int(meta_length[0])
        print("Downloading: {0} Bytes: {1}".format(url, file_size))

        file_size_dl = 0
        block_sz = 8192
        while True:
            buffer = u.read(block_sz)
            if not buffer:
                break
            file_size_dl += len(buffer)
            f.write(buffer)

            # Overwrite the same console line with the running byte count and percentage
            status = "{0:16}".format(file_size_dl)
            if file_size:
                status += " [{0:6.2f}%]".format(file_size_dl * 100 / file_size)
            status += chr(13)
            print(status, end="")
        print()

    return filename


def collect_all_url(page_url, extensions):
    """
    Collect all links on page_url whose href contains one of the desired extensions.
    """
    conn = urllib2.urlopen(page_url)
    html = conn.read()

    soup = BeautifulSoup(html, 'lxml')
    links = soup.find_all('a')

    results = []
    for tag in links:
        link = tag.get('href', None)
        if link is not None:
            for e in extensions:
                if e in link:
                    # Fallback for badly defined links:
                    # check for a missing scheme or netloc
                    parsed = urlparse.urlparse(link)
                    if parsed.scheme and parsed.netloc:
                        results.append(link)
                    else:
                        # Relative link: resolve it against the page URL
                        new_url = urlparse.urljoin(page_url, link)
                        results.append(new_url)
    return results


if __name__ == "__main__":  # Only run if this file is called directly
    # Command line arguments
    parser = argparse.ArgumentParser(
        description='Download all files with the given extensions from a webpage.')
    parser.add_argument(
        '-u', '--url',
        help='Page URL to request')
    parser.add_argument(
        '-e', '--ext',
        nargs='+',
        help='Extension(s) to find')
    parser.add_argument(
        '-d', '--dest',
        default=None,
        help='Destination directory where the files are saved')
    parser.add_argument(
        '-p', '--par',
        action='store_true', default=False,
        help='Turn on parallel download')
    args = parser.parse_args()

    t1 = time.time()

    # Recover the files to download
    all_links = collect_all_url(args.url, args.ext)

    # Download
    if not args.par:
        for l in all_links:
            try:
                filename = download_file(l, args.dest)
                print(l)
            except Exception as e:
                print("Error while downloading: {}".format(e))
    else:
        # from: https://markhneedham.com/blog/2018/07/15/python-parallel-download-files-requests/
        from multiprocessing.pool import ThreadPool
        results = ThreadPool(10).imap_unordered(
            lambda x: download_file(x, args.dest), all_links)
        for p in results:
            print(p)

    t2 = time.time()
    print("Elapsed time: {}".format(t2 - t1))
Try it: assuming the script above is saved as download_files.py (the filename is arbitrary), that beautifulsoup4 and lxml are installed, and that the destination directory already exists, a typical invocation looks like the following.
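python3 download_files.py -u https://example.com/reports -e .pdf .csv -d downloads -p

The URL, extensions, and directory name here are only placeholders; drop -p to download the files sequentially instead of through the thread pool.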