Last active
August 1, 2023 21:04
-
-
Save jps3/4d250e46b4aeb2ebafbce2c6040c3cce to your computer and use it in GitHub Desktop.
Downloads PDF files from links on https://macadmins.psu.edu/conference/resources/
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3
# Page that is scanned for download links.
URL = "https://macadmins.psu.edu/conference/resources/"
# Anchor text identifying the links whose targets should be downloaded.
LINK_TEXT = "Slides"
# Directory to save downloads into; leave empty to default to ~/Downloads.
# ex. DOWNLOAD_PATH = "/Users/me/Documents/Sysadmin/Conferences/2023/PSU MacAdmins/Slides"
DOWNLOAD_PATH = ""
import os | |
import requests | |
import sys | |
from bs4 import BeautifulSoup | |
from pathlib import Path | |
from tqdm import tqdm | |
from urllib.parse import urlparse, unquote | |
def main():
    """Download every file linked as LINK_TEXT on the page at URL.

    Fetches URL, collects the ``href`` of each matching anchor, and streams
    each target into the download directory (DOWNLOAD_PATH, or ~/Downloads
    when DOWNLOAD_PATH is empty) while showing a progress bar. A file that is
    already on disk with the same size as the reported Content-Length is
    skipped. Links that do not answer with HTTP 200 are collected and listed
    at the end.

    Raises:
        requests.HTTPError: if the index page at URL returns an error status.
    """
    # Local import: the file-level import only brings in urlparse/unquote.
    from urllib.parse import urljoin

    request_timeout = 30  # seconds; without it a stalled server hangs forever
    block_size = 1024  # 1 KiB

    # Resolve into a local name. Assigning to DOWNLOAD_PATH inside this
    # function would make it function-local and raise UnboundLocalError on
    # the emptiness check.
    if DOWNLOAD_PATH:
        download_dir = Path(DOWNLOAD_PATH)
    else:
        download_dir = Path.home() / "Downloads"  # Default: ~/Downloads
    download_dir.mkdir(parents=True, exist_ok=True)

    doc = requests.get(URL, timeout=request_timeout)
    doc.raise_for_status()
    soup = BeautifulSoup(doc.content, "html.parser")

    # NOTE(review): `LINK_TEXT in tag` appears to test the tag's direct
    # children for an exact match rather than a substring of its text --
    # confirm this is the intended matching rule.
    anchors = soup.find_all(lambda tag: tag.name == "a" and LINK_TEXT in tag)
    # Drop anchors without an href and resolve relative links against URL.
    found_links = [urljoin(URL, a.get("href")) for a in anchors if a.get("href")]
    missing_links = []
    print("Found {} links.".format(len(found_links)))

    for href in found_links:
        download_filename = unquote(Path(urlparse(href).path).name)
        if not download_filename:
            # A path ending in "/" has no file name; writing would target
            # the download directory itself.
            print(
                "Skipping '{0:s}': no file name in URL.".format(href),
                file=sys.stderr,
            )
            continue
        download_path = download_dir / download_filename

        # None (not 0) marks "no file yet", so a missing file is never
        # mistaken for a complete one when Content-Length is absent.
        try:
            existing_file_size = os.stat(download_path).st_size
        except FileNotFoundError:
            existing_file_size = None

        # The context manager releases the connection even if writing fails.
        with requests.get(href, stream=True, timeout=request_timeout) as req:
            if req.status_code != 200:
                missing_links.append(href)
                print(
                    "Error: HTTP code {0:>3d} returned for '{1:s}'".format(
                        req.status_code, href
                    ),
                    file=sys.stderr,
                )
                continue

            total_size_in_bytes = int(req.headers.get("Content-Length", 0))
            if total_size_in_bytes and existing_file_size == total_size_in_bytes:
                print(
                    'File {0:s} exists. Skipping.'.format(download_filename),
                    file=sys.stderr,
                )
                continue

            with tqdm(
                total=total_size_in_bytes,
                unit="iB",
                unit_scale=True,
                desc=download_filename,
            ) as progress_bar:
                with open(download_path, "wb") as file:
                    for data in req.iter_content(block_size):
                        progress_bar.update(len(data))
                        file.write(data)
                received_bytes = progress_bar.n

            if total_size_in_bytes != 0 and received_bytes != total_size_in_bytes:
                print("ERROR: Something went wrong", file=sys.stderr)
                # TODO: ask user to exit (w/ default timeout and action?)

    if missing_links:
        print("Missing links (did not return HTTP 200 on request):")
        print("\n".join(missing_links))


if __name__ == "__main__":
    main()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
beautifulsoup4 >= 4.12.2
requests >= 2.23.0
tqdm >= 4.65.0
urllib3 >= 2.0.4
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment