Downloads PDF files from links on https://macadmins.psu.edu/conference/resources/
#!/usr/bin/env python3

import os
import sys
from pathlib import Path
from urllib.parse import urlparse, unquote

import requests
from bs4 import BeautifulSoup
from tqdm import tqdm

# --- User configuration ---
URL = "https://macadmins.psu.edu/conference/resources/"
LINK_TEXT = "Slides"
# ex. DOWNLOAD_PATH = "/Users/me/Documents/Sysadmin/Conferences/2023/PSU MacAdmins/Slides"
DOWNLOAD_PATH = ""  # Leave empty to default to ~/Downloads
def main():
    # Fall back to ~/Downloads when no DOWNLOAD_PATH is configured.
    download_dir = Path(DOWNLOAD_PATH) if DOWNLOAD_PATH else Path.home() / "Downloads"

    doc = requests.get(URL)
    soup = BeautifulSoup(doc.content, "html.parser")

    # Collect the href of every <a> tag whose link text is LINK_TEXT.
    found_links = [
        a.get("href")
        for a in soup.find_all(lambda tag: tag.name == "a" and LINK_TEXT in tag)
    ]
    missing_links = []
    print("Found {} links.".format(len(found_links)))

    for href in found_links:
        download_filename = unquote(Path(urlparse(href).path).name)
        download_path = download_dir / download_filename

        # Size of any previously downloaded copy, used to skip completed files.
        existing_file_size = 0
        try:
            existing_file_size = os.stat(download_path).st_size
        except FileNotFoundError:
            pass  # This is ok, we can ignore this

        req = requests.get(href, stream=True)
        total_size_in_bytes = int(req.headers.get("Content-Length", 0))
        block_size = 1024  # 1 KiB

        if req.status_code == 200:
            if existing_file_size != total_size_in_bytes:
                progress_bar = tqdm(total=total_size_in_bytes, unit="iB", unit_scale=True)
                with open(download_path, "wb") as file:
                    progress_bar.set_description(download_filename)
                    for data in req.iter_content(block_size):
                        progress_bar.update(len(data))
                        file.write(data)
                progress_bar.close()
                if total_size_in_bytes != 0 and progress_bar.n != total_size_in_bytes:
                    print("ERROR: Something went wrong")
                    # TODO: ask user to exit (w/ default timeout and action?)
            else:
                print("File {0:s} exists. Skipping.".format(download_filename), file=sys.stderr)
        else:
            missing_links.append(href)
            print(
                "Error: HTTP code {0:>3d} returned for '{1:s}'".format(
                    req.status_code, href
                ),
                file=sys.stderr,
            )
        req.close()

    if missing_links:
        print("Missing links (did not return HTTP 200 on request):")
        print("\n".join(missing_links))


if __name__ == "__main__":
    main()
Dependencies:

beautifulsoup4 >= 4.12.2
requests >= 2.23.0
tqdm >= 4.65.0
urllib3 >= 2.0.4
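For reference, a minimal sketch (using made-up HTML, not the real page markup) of how the find_all filter in the script behaves: the test LINK_TEXT in tag matches only anchors whose direct text child is exactly "Slides", so link text with extra words or a nested tag is not selected.

from bs4 import BeautifulSoup

html = """
<a href="/a.pdf">Slides</a>
<a href="/b.pdf">Slides (PDF)</a>
<a href="/c.pdf"><span>Slides</span></a>
"""
soup = BeautifulSoup(html, "html.parser")
links = [
    a.get("href")
    for a in soup.find_all(lambda tag: tag.name == "a" and "Slides" in tag)
]
print(links)  # ['/a.pdf'] -- only the anchor whose direct text child equals "Slides"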