# Downloads all files linked via href from a webpage (PDFs, MP4 videos, ZIPs)

import os
import re
import urllib.parse
from pathlib import Path
from multiprocessing import Pool

import httpx
WEBPAGE_URL = "https://google.com/foo/bar/"
"""URL to the webpage to download hrefs."""
GET_KWARGS = {
    # 'auth': ('user', 'password'),
    # 'headers': {
    #     'User-Agent': 'httpx/0.23.0'
    # }
}
"""Additional keyword arguments for the GET requests."""
DOWNLOAD_DIR = Path('./downloads')
"""Directory to save the downloaded files in."""
# Create the download directory if it doesn't exist yet
os.makedirs(DOWNLOAD_DIR, exist_ok=True)
# Get the webpage
webpage = httpx.get(WEBPAGE_URL, **GET_KWARGS).text
# regex that finds href in a tag
href_regex = re.compile(r'href=[\"\'](.*?)[\"\']')
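# e.g. in '<a href="files/report.pdf">', the non-greedy group captures 'files/report.pdf'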
# Search for all hrefs in the webpage
hrefs = href_regex.findall(webpage)
print(f"Found {len(hrefs)} hrefs")

def download_href(href: str):
    # Make the link absolute
    href = urllib.parse.urljoin(WEBPAGE_URL, href)

    # We are only interested in the hrefs to PDFs, videos and ZIPs
    if not href.endswith(('.pdf', '.mp4', '.zip')):
        # print(f"Skipping {href}")
        return

    # Get the filename (last part of the URL)
    filename = href.split('/')[-1]
    file_path = DOWNLOAD_DIR / filename

    # Check if the file has already been downloaded
    if os.path.exists(file_path):
        print(f"File {filename} already exists, skipping")
        return

    print(f'Downloading {href}')
    web_file_response = httpx.get(href, **GET_KWARGS)
    web_file_response.raise_for_status()  # Raise an exception if the status code is not 2xx

    # Save the file to the filesystem
    print(f'Saving {href} to {file_path}')
    with open(file_path, 'wb') as f:
        f.write(web_file_response.content)

# Execute the downloads in parallel
with Pool(processes=4) as executor:
    # Pool.map blocks until all downloads are finished and re-raises any
    # exception from a worker process in the main process
    results = executor.map(download_href, hrefs)
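
Note: on platforms where multiprocessing uses the spawn start method (Windows, and macOS since Python 3.8), every worker process re-imports this module, so the top-level requests above run again in each worker and creating the Pool outside a guard can fail with a RuntimeError. A minimal sketch of the same flow moved behind an "if __name__ == '__main__':" guard, reusing the names defined above:

def main():
    os.makedirs(DOWNLOAD_DIR, exist_ok=True)
    webpage = httpx.get(WEBPAGE_URL, **GET_KWARGS).text
    hrefs = href_regex.findall(webpage)
    print(f"Found {len(hrefs)} hrefs")
    with Pool(processes=4) as executor:
        executor.map(download_href, hrefs)

if __name__ == '__main__':
    main()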