Skip to content

Instantly share code, notes, and snippets.

@cboulanger
Last active May 28, 2024 16:52
Show Gist options
  • Save cboulanger/bd9300ad4481d29b87c3b35a5fcc798c to your computer and use it in GitHub Desktop.
Save cboulanger/bd9300ad4481d29b87c3b35a5fcc798c to your computer and use it in GitHub Desktop.
Download a pad.gwdg.de presentation for local offline viewing. Script written with the help of ChatGPT-4.
import os
import re
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
import argparse
def download_resource(session, url, base_dir, level=0, _visited=None):
    """Download ``url`` below ``base_dir`` and follow pad.gwdg.de URLs found in JS/CSS.

    The file is stored at ``base_dir`` joined with the path component of
    ``url`` (query string and fragment are dropped).  If the response is
    JavaScript or CSS, its text is scanned for further pad.gwdg.de resources,
    which are downloaded recursively.  Errors are printed and never raised so
    that one broken resource does not abort the whole run.

    Args:
        session: Object with a ``requests.Session``-compatible ``get`` method.
        url: Absolute URL of the resource to fetch.
        base_dir: Directory that mirrors the server's path layout.
        level: Recursion depth; only used to indent the log output.
        _visited: Internal set of URLs already handled in this call tree.
            Callers should leave it unset.
    """
    if _visited is None:
        _visited = set()
    if url in _visited:
        # Scripts/stylesheets may reference each other (or themselves);
        # without this guard the recursion would never terminate normally.
        return
    _visited.add(url)

    indent = " " * level
    try:
        parsed_url = urlparse(url)
        resource_path = os.path.join(base_dir, parsed_url.path.lstrip('/'))
        # URLs come from remote content: only accept targets that are files
        # strictly inside base_dir (rejects directory URLs and "../" escapes).
        root = os.path.join(os.path.abspath(base_dir), '')
        if (not os.path.basename(resource_path)
                or not os.path.abspath(resource_path).startswith(root)):
            print(f"{indent}Skipped downloading {url}: no file path inside {base_dir}")
            return

        # The context manager releases the streamed connection back to the pool.
        with session.get(url, stream=True) as response:
            response.raise_for_status()
            # Check the content type to ensure it's not an HTML error page
            content_type = response.headers.get('Content-Type', '')
            if 'text/html' in content_type and not url.endswith(".html"):
                print(f"Skipped downloading {url}: MIME type is text/html (likely a 404 page)")
                return
            content = b"".join(response.iter_content(chunk_size=8192))

        # Tolerate "text/css;charset=utf-8" (no space) and upper-case MIME types.
        mime_type = content_type.split(";")[0].strip().lower()
        content_is_code = mime_type in ('application/javascript', 'text/javascript', 'text/css')

        # Monkey-patch the javascript.  This is done on the complete body:
        # patching chunk by chunk misses a pattern that straddles a chunk
        # boundary and can fail when a multi-byte character is split.
        if resource_path.endswith('slide-pack.9fe42901cee029fba75d.js'):
            content = content.replace(b'src:serverurl+"/build/', b'src:"build/')

        resource_dir = os.path.dirname(resource_path)
        if resource_dir:
            os.makedirs(resource_dir, exist_ok=True)
        print(f'{indent}Saving {parsed_url.path} to {resource_path}')
        with open(resource_path, 'wb') as f:
            f.write(content)

        # If the downloaded file is JavaScript or CSS, check for more URLs
        if content_is_code:
            # errors='replace': a stray non-UTF-8 byte must not turn an
            # already saved file into a reported failure.
            content_str = content.decode('utf-8', errors='replace')
            matches = re.findall(r'https://pad\.gwdg\.de/[^\'"\s\)\]]+', content_str)
            # Non-capturing group: with a capturing group re.findall returns
            # only "build"/"css"/"js" instead of the whole path.
            matches += [
                'https://pad.gwdg.de' + path
                for path in re.findall(r'/(?:build|css|js)/[^\'"\s\)\]]+', content_str)
            ]
            for match in matches:
                if not match.endswith("/"):
                    download_resource(session, match, base_dir, level + 1, _visited)
    except Exception as e:
        # Deliberate best-effort boundary: report and continue with the next resource.
        print(f"{indent}Failed to download {url}: {e}")
def remove_csp(soup):
    """Remove all ``<meta http-equiv="Content-Security-Policy">`` tags from ``soup``.

    The ``http-equiv`` value is compared case-insensitively (and ignoring
    surrounding whitespace), because HTML treats it as a case-insensitive
    keyword; an exact comparison would leave ``content-security-policy``
    tags in place.

    Args:
        soup: Parsed document offering ``find_all``; it is modified in place.
    """
    for meta in soup.find_all("meta"):
        http_equiv = meta.attrs.get('http-equiv')
        if isinstance(http_equiv, str) and http_equiv.strip().lower() == 'content-security-policy':
            meta.decompose()
def replace_and_download_resources(session, soup, base_url, base_dir):
    """Download the resources referenced by ``soup`` and localise inline scripts.

    Fetches the targets of ``<img src>``, ``<link href>`` and ``<script src>``
    (resolved against ``base_url``).  In inline ``<script>`` bodies every
    ``https://pad.gwdg.de/...`` URL is downloaded and replaced in the script
    text by its path relative to the site root.

    Args:
        session: HTTP session handed through to ``download_resource``.
        soup: Parsed document; inline script text is modified in place.
        base_url: URL against which relative references are resolved.
        base_dir: Local directory the resources are stored under.
    """
    url_attribute = {'img': 'src', 'link': 'href', 'script': 'src'}
    requested = set()

    def _fetch(resource_url):
        # Only http(s) can be fetched; data:, javascript: or mailto:
        # references would merely produce a failed request.
        if urlparse(resource_url).scheme not in ('http', 'https'):
            return
        # The same asset is often referenced several times; fetch it once.
        if resource_url in requested:
            return
        requested.add(resource_url)
        download_resource(session, resource_url, base_dir)

    for tag in soup.find_all(['img', 'link', 'script']):
        reference = tag.get(url_attribute[tag.name])
        if reference:
            _fetch(urljoin(base_url, reference))

    # Handle dynamically generated URLs in script content
    for script in soup.find_all('script'):
        if not script.string:
            continue
        original_script = script.string
        updated_script = original_script
        # ")" and "]" terminate a URL here as in the file's other URL
        # patterns, so url(...) wrappers do not leak into the request.
        matches = re.findall(r'https://pad\.gwdg\.de/([^\'"\s\)\]]+)', original_script)
        for match in matches:
            resource_url = urljoin(base_url, match)
            _fetch(resource_url)
            updated_script = updated_script.replace(resource_url, match)
        if updated_script != original_script:
            script.string.replace_with(updated_script)
def download_uploads_resources(session, html_str, base_dir):
    """Download every ``https://pad.gwdg.de/uploads/...`` URL found in ``html_str``.

    Each occurrence is passed, in document order, to ``download_resource``
    together with ``session`` and ``base_dir``.
    """
    uploads_pattern = re.compile(r'https://pad\.gwdg\.de/uploads/[^\s\'\"\)\]]+')
    for occurrence in uploads_pattern.finditer(html_str):
        download_resource(session, occurrence.group(0), base_dir)
def download_additional_resources(session, base_url, base_dir, additional_paths):
    """Download each entry of ``additional_paths`` after resolving it against ``base_url``.

    The resolved URLs are handed one by one, in the given order, to
    ``download_resource`` together with ``session`` and ``base_dir``.
    """
    # Generator expression: each URL is resolved right before it is downloaded.
    resolved_urls = (urljoin(base_url, extra_path) for extra_path in additional_paths)
    for resolved_url in resolved_urls:
        download_resource(session, resolved_url, base_dir)
def download_html_and_resources(slide_id, output_html, base_dir, additional_paths):
    """Save the slide page ``https://pad.gwdg.de/p/<slide_id>`` and its resources locally.

    The page is fetched, its CSP meta tags are removed, referenced and
    additional resources are downloaded into ``base_dir``, and the HTML is
    written to ``base_dir/output_html`` with the server's base URL replaced
    by ``./``.

    Args:
        slide_id: Identifier of the slide, inserted into the page URL.
        output_html: File name of the saved HTML inside ``base_dir``.
        base_dir: Directory that receives the HTML file and all resources.
        additional_paths: Extra resource paths (relative to the site root)
            to download in addition to those discovered in the page.

    Raises:
        requests.HTTPError: If the slide page itself answers with an error status.
    """
    base_url = "https://pad.gwdg.de"
    page_url = f"{base_url}/p/{slide_id}"

    # The context manager closes the session's pooled connections even when
    # the page request fails.
    with requests.Session() as session:
        response = session.get(page_url)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        remove_csp(soup)  # Remove CSP settings
        # Replace occurrences of the base URL with the local paths and download resources
        replace_and_download_resources(session, soup, base_url, base_dir)
        # Convert the soup object to string to find and download uploads resources
        html_str = str(soup)
        download_uploads_resources(session, html_str, base_dir)
        # Download additional specified resources
        download_additional_resources(session, base_url, base_dir, additional_paths)

    # Convert all occurrences of the base URL to local paths
    html_str = html_str.replace(base_url + '/', './')
    # Ensure the base directory exists
    os.makedirs(base_dir, exist_ok=True)
    # Save the modified HTML
    with open(os.path.join(base_dir, output_html), 'w', encoding='utf-8') as f:
        f.write(html_str)
if __name__ == "__main__":
    # Command-line entry point: -i/--id selects the slide, -d/--dir the target directory.
    cli = argparse.ArgumentParser(description="Download and save HTML and resources from pad.gwdg.de")
    cli.add_argument('-i', '--id', required=True, help="The slide ID from pad.gwdg.de")
    cli.add_argument('-d', '--dir', required=True, help="The directory where the resources will be saved")
    options = cli.parse_args()

    # this is a manual collection of resources that were not discovered by the script, probably incomplete
    # (grouped by kind; the groups are concatenated in their original order below)
    font_paths = [
        "build/85934a8a31bd9b8b75e68eeb57b6859810055d48742953766c4a5c2b5a0d5266.woff",
        "build/8810ba3440bf482ced33d2f74b7803bba711f689d8e4caa7da5c6ae6844a1b49.woff2",
        "build/006708d6691753cfc46eec2dae88fbdafa22823a89194149d9f223050dc78998.woff",
        "build/4f319287827e35f841069eb471c092eccf97d2f7830aa4d8bd7301ded418bf49.ttf",
    ]
    mathjax_paths = [
        "build/MathJax/jax/input/TeX/config.js?V=2.7.9",
        "build/MathJax/jax/input/MathML/config.js?V=2.7.9",
        "build/MathJax/jax/output/HTML-CSS/config.js?V=2.7.9",
        "build/MathJax/jax/output/NativeMML/config.js?V=2.7.9",
        "build/MathJax/jax/output/PreviewHTML/config.js?V=2.7.9",
        "build/MathJax/extensions/tex2jax.js?V=2.7.9",
        "build/MathJax/extensions/mml2jax.js?V=2.7.9",
        "build/MathJax/extensions/MathEvents.js?V=2.7.9",
        "build/MathJax/extensions/MathZoom.js?V=2.7.9",
        "build/MathJax/extensions/MathMenu.js?V=2.7.9",
        "build/MathJax/extensions/toMathML.js?V=2.7.9",
        "build/MathJax/extensions/TeX/noErrors.js?V=2.7.9",
        "build/MathJax/extensions/TeX/noUndefined.js?V=2.7.9",
        "build/MathJax/extensions/TeX/AMSmath.js?V=2.7.9",
        "build/MathJax/extensions/TeX/AMSsymbols.js?V=2.7.9",
        "build/MathJax/extensions/fast-preview.js?V=2.7.9",
        "build/MathJax/extensions/AssistiveMML.js?V=2.7.9",
        "build/MathJax/extensions/a11y/accessibility-menu.js?V=2.7.9",
        "build/MathJax/extensions/Safe.js?V=2.7.9",
    ]
    bundle_paths = [
        "build/29.5f5bdb9120d6b9c39930.js",
        "build/27.fbb6b5bbda6765f0a1f1.js",
    ]
    reveal_paths = [
        "build/reveal.js/plugin/notes/notes.js",
        "build/reveal.js/css/theme/white.css",
        "build/reveal.js/lib/font/source-sans-pro/source-sans-pro.css",
        "build/reveal.js/css/print/paper.css",
        "build/reveal.js/lib/font/source-sans-pro/source-sans-pro-regular.ttf",
        "build/reveal.js/lib/font/source-sans-pro/source-sans-pro-regular.woff",
    ]

    download_html_and_resources(
        options.id,
        "index.html",
        options.dir,
        font_paths + mathjax_paths + bundle_paths + reveal_paths,
    )
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment