Downloads PDF files from links on https://macadmins.psu.edu/conference/resources/
#!/usr/bin/env python3

import os
import sys
from pathlib import Path
from urllib.parse import urlparse, unquote

import requests
from bs4 import BeautifulSoup
from tqdm import tqdm

# --- User configuration ---
URL = "https://macadmins.psu.edu/conference/resources/"
LINK_TEXT = "Slides"
# ex. DOWNLOAD_PATH = "/Users/me/Documents/Sysadmin/Conferences/2023/PSU MacAdmins/Slides"
DOWNLOAD_PATH = ""  # Leave empty to default to ~/Downloads
def main():
    # Fall back to ~/Downloads when no DOWNLOAD_PATH is configured.
    download_dir = Path(DOWNLOAD_PATH) if DOWNLOAD_PATH else Path.home() / "Downloads"

    doc = requests.get(URL)
    soup = BeautifulSoup(doc.content, "html.parser")

    # Collect the href of every <a> tag whose link text is LINK_TEXT.
    found_links = [
        a.get("href")
        for a in soup.find_all(lambda tag: tag.name == "a" and LINK_TEXT in tag)
    ]
    missing_links = []
    print("Found {} links.".format(len(found_links)))

    for href in found_links:
        download_filename = unquote(Path(urlparse(href).path).name)
        download_path = download_dir / download_filename

        # Size of any previously downloaded copy, used to skip completed files.
        existing_file_size = 0
        try:
            existing_file_size = os.stat(download_path).st_size
        except FileNotFoundError:
            pass  # This is ok, we can ignore this

        req = requests.get(href, stream=True)
        total_size_in_bytes = int(req.headers.get("Content-Length", 0))
        block_size = 1024  # 1 KiB

        if req.status_code == 200:
            if existing_file_size != total_size_in_bytes:
                progress_bar = tqdm(total=total_size_in_bytes, unit="iB", unit_scale=True)
                with open(download_path, "wb") as file:
                    progress_bar.set_description(download_filename)
                    for data in req.iter_content(block_size):
                        progress_bar.update(len(data))
                        file.write(data)
                progress_bar.close()
                if total_size_in_bytes != 0 and progress_bar.n != total_size_in_bytes:
                    print("ERROR: Something went wrong")
                    # TODO: ask user to exit (w/ default timeout and action?)
            else:
                print("File {0:s} exists. Skipping.".format(download_filename), file=sys.stderr)
        else:
            missing_links.append(href)
            print(
                "Error: HTTP code {0:>3d} returned for '{1:s}'".format(
                    req.status_code, href
                ),
                file=sys.stderr,
            )
        req.close()

    if missing_links:
        print("Missing links (did not return HTTP 200 on request):")
        print("\n".join(missing_links))


if __name__ == "__main__":
    main()
Dependencies:

beautifulsoup4 >= 4.12.2
requests >= 2.23.0
tqdm >= 4.65.0
urllib3 >= 2.0.4
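For reference, a minimal sketch (using made-up HTML, not the real page markup) of how the find_all filter in the script behaves: the test LINK_TEXT in tag matches only anchors whose direct text child is exactly "Slides", so link text with extra words or a nested tag is not selected.

from bs4 import BeautifulSoup

html = """
<a href="/a.pdf">Slides</a>
<a href="/b.pdf">Slides (PDF)</a>
<a href="/c.pdf"><span>Slides</span></a>
"""
soup = BeautifulSoup(html, "html.parser")
links = [
    a.get("href")
    for a in soup.find_all(lambda tag: tag.name == "a" and "Slides" in tag)
]
print(links)  # ['/a.pdf'] -- only the anchor whose direct text child equals "Slides"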