@charles-l
Last active March 2, 2024 19:57
A Python script that saves the Firefox Reader view of a page, images included. Kind of a personal archive.org tool, but using zip and HTML files rather than WARC.
#!/usr/bin/env python3
from bs4 import BeautifulSoup
from readability import Document
import click
from click import echo
import requests
import slugify
import os
import os.path
import urllib.parse
import tempfile
import zipfile
import datetime


@click.command()
@click.argument('url')
def save_page(url):
    with tempfile.TemporaryDirectory() as tempdir:
        response = requests.get(url)
        doc = Document(response.text)
        nice_name = slugify.slugify(doc.title())
        doc_dom = BeautifulSoup(doc.summary(), features='lxml')

        # save images next to the page, rewriting each src to point
        # at the local copy
        for i, img in enumerate(doc_dom.find_all('img')):
            img_url = urllib.parse.urlparse(img['src'])
            _, extension = os.path.splitext(img_url.path)
            if not extension:
                echo('No file extension for img src, leaving as is: ' + img_url.geturl(), err=True)
            else:
                if img_url.hostname:
                    img_resp = requests.get(img_url.geturl())
                else:
                    # relative URL: resolve it against the page URL
                    img_resp = requests.get(urllib.parse.urljoin(url, img_url.geturl()))
                assert img_resp.ok
                saved_path = f'{i}{extension}'
                with open(os.path.join(tempdir, saved_path), 'wb') as f:
                    f.write(img_resp.content)
                img['src'] = saved_path

        # update relative links to point back at the original site
        for a in doc_dom.find_all('a'):
            if not a.has_attr('href'):
                echo('Malformed a tag - no href - skipping: ' + str(a), err=True)
                continue
            a_url = urllib.parse.urlparse(a['href'])
            if not a_url.hostname:
                a['href'] = urllib.parse.urljoin(url, a['href'])

        with open(os.path.join(tempdir, nice_name + '.html'), 'w') as f:
            f.write(str(doc_dom))

        # bundle everything, plus a small meta file recording the source
        # URL and fetch time, into a single zip
        with zipfile.ZipFile(nice_name + '.zip', 'w') as zipf:
            zipf.writestr(os.path.join(nice_name, 'meta'),
                          '\n'.join([url, str(datetime.datetime.utcnow()) + ' UTC']))
            for p in os.listdir(tempdir):
                zipf.write(os.path.join(tempdir, p), arcname=os.path.join(nice_name, p))


if __name__ == '__main__':
    save_page()
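
Usage is a single command. Assuming the script is saved as save_page.py (the filename is arbitrary, and the output name below is illustrative), the zip is named after the slugified page title:

    $ python3 save_page.py https://example.com/some-article
    $ ls
    some-article-title.zip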
@charles-l (Author)

Maybe TODO:

  • Make it faster by fetching images in parallel? Might be useful for image-heavy pages (see the thread-pool sketch below).
  • Fix URLs for other elements on the page besides just images and anchors? (A generic attribute-rewriting sketch follows this list.)
  • More testing (tables, videos, weird layouts, etc.). This is mostly targeted at scraping personal web pages, so I'm not sure how broadly useful it can be.
  • Potentially include JavaScript resources for interactive, explanation-y blog posts? I'm not sure whether readability would strip those elements out (justifiably).
  • Go crazy and implement an indexer and full-text search. Maybe a full personalized archive.org thing that lives in a single SQLite database (see the FTS sketch at the end)? Kinda like recoll, but simpler.
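
A minimal sketch of the parallel-fetch idea, using concurrent.futures from the standard library. fetch_image is a hypothetical helper standing in for the per-image request logic in save_page:

    # Sketch: fetch images concurrently with a thread pool.
    # fetch_image is a hypothetical stand-in for the per-image
    # logic inside save_page.
    from concurrent.futures import ThreadPoolExecutor
    import requests

    def fetch_image(img_url):
        resp = requests.get(img_url)
        resp.raise_for_status()
        return resp.content

    def fetch_all(img_urls, max_workers=8):
        # Threads are fine here: the work is network-bound, not CPU-bound.
        with ThreadPoolExecutor(max_workers=max_workers) as pool:
            return list(pool.map(fetch_image, img_urls))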

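For the other-elements bullet, one approach is a table mapping tags to their URL-bearing attributes, then a single pass that absolutizes whatever is relative. The tag-to-attribute map here is an assumption about which elements matter:

    # Sketch: absolutize URLs on more than just <img> and <a>.
    # The URL_ATTRS map is an assumption; extend as needed.
    import urllib.parse

    URL_ATTRS = {'img': 'src', 'a': 'href', 'source': 'src',
                 'video': 'src', 'audio': 'src', 'link': 'href'}

    def absolutize(doc_dom, base_url):
        for tag, attr in URL_ATTRS.items():
            for el in doc_dom.find_all(tag):
                if el.has_attr(attr) and not urllib.parse.urlparse(el[attr]).hostname:
                    el[attr] = urllib.parse.urljoin(base_url, el[attr])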
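And a rough cut of the last bullet: SQLite ships with the FTS5 extension, so a personal index could be a single virtual table. The schema and the get_text() extraction are assumptions about how the archive would be stored:

    # Sketch: full-text index over saved pages in one SQLite database,
    # using the built-in FTS5 extension. Schema is an assumption.
    import sqlite3

    db = sqlite3.connect('archive.db')
    db.execute('CREATE VIRTUAL TABLE IF NOT EXISTS pages '
               'USING fts5(title, url, body)')

    def index_page(title, url, doc_dom):
        # doc_dom is the BeautifulSoup tree from save_page;
        # get_text() flattens it to plain text for indexing.
        db.execute('INSERT INTO pages VALUES (?, ?, ?)',
                   (title, url, doc_dom.get_text()))
        db.commit()

    def search(query):
        return db.execute(
            'SELECT title, url FROM pages WHERE pages MATCH ?',
            (query,)).fetchall()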