# Downloads all files linked via href from a webpage (PDFs, MP4 videos, ZIPs)

import os
import re
import urllib.parse
from pathlib import Path
from multiprocessing import Pool

import httpx
WEBPAGE_URL = "https://google.com/foo/bar/"
"""URL to the webpage to download hrefs."""
GET_KWARGS = {
    # 'auth': ('user', 'password'),
    # 'headers': {
    #     'User-Agent': 'httpx/0.23.0'
    # }
}
"""Additional keyword arguments for the GET requests."""
DOWNLOAD_DIR = Path('./downloads')
"""Directory to save the downloaded files in."""
# Create the download directory if it doesn't exist yet
os.makedirs(DOWNLOAD_DIR, exist_ok=True)
# Get the webpage
webpage = httpx.get(WEBPAGE_URL, **GET_KWARGS).text
# regex that finds href in a tag
href_regex = re.compile(r'href=[\"\'](.*?)[\"\']')
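# e.g. in '<a href="files/report.pdf">', the non-greedy group captures 'files/report.pdf'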
# Search for all hrefs in the webpage
hrefs = href_regex.findall(webpage)
print(f"Found {len(hrefs)} hrefs")

def download_href(href: str):
    # Make the link absolute
    href = urllib.parse.urljoin(WEBPAGE_URL, href)

    # We are only interested in the hrefs to PDFs, videos and ZIPs
    if not href.endswith(('.pdf', '.mp4', '.zip')):
        # print(f"Skipping {href}")
        return

    # Get the filename (last part of the URL)
    filename = href.split('/')[-1]
    file_path = DOWNLOAD_DIR / filename

    # Check if the file has already been downloaded
    if os.path.exists(file_path):
        print(f"File {filename} already exists, skipping")
        return

    print(f'Downloading {href}')
    web_file_response = httpx.get(href, **GET_KWARGS)
    web_file_response.raise_for_status()  # Raise an exception if the status code is not 2xx

    # Save the file to the filesystem
    print(f'Saving {href} to {file_path}')
    with open(file_path, 'wb') as f:
        f.write(web_file_response.content)

# Execute the downloads in parallel
with Pool(processes=4) as executor:
    # Pool.map blocks until all downloads are finished and re-raises any
    # exception from a worker process in the main process
    results = executor.map(download_href, hrefs)
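
Note: on platforms where multiprocessing uses the spawn start method (Windows, and macOS since Python 3.8), every worker process re-imports this module, so the top-level requests above run again in each worker and creating the Pool outside a guard can fail with a RuntimeError. A minimal sketch of the same flow moved behind an "if __name__ == '__main__':" guard, reusing the names defined above:

def main():
    os.makedirs(DOWNLOAD_DIR, exist_ok=True)
    webpage = httpx.get(WEBPAGE_URL, **GET_KWARGS).text
    hrefs = href_regex.findall(webpage)
    print(f"Found {len(hrefs)} hrefs")
    with Pool(processes=4) as executor:
        executor.map(download_href, hrefs)

if __name__ == '__main__':
    main()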