@charles-l
Last active March 2, 2024 19:57
A Python script that saves the Firefox Reader view of a page, images included. Kind of a personal archive.org tool, but using zip and HTML files rather than WARC.
#!/usr/bin/env python3
from bs4 import BeautifulSoup
from readability import Document
import click
from click import echo
import requests
import slugify
import os
import os.path
import urllib.parse
import tempfile
import zipfile
import datetime


@click.command()
@click.argument('url')
def save_page(url):
    with tempfile.TemporaryDirectory() as tempdir:
        response = requests.get(url)
        doc = Document(response.text)
        nice_name = slugify.slugify(doc.title())
        doc_dom = BeautifulSoup(doc.summary(), features='lxml')

        # save images next to the page, rewriting each src to point
        # at the local copy
        for i, img in enumerate(doc_dom.find_all('img')):
            img_url = urllib.parse.urlparse(img['src'])
            _, extension = os.path.splitext(img_url.path)
            if not extension:
                echo('No file extension for img src, leaving as is: ' + img_url.geturl(), err=True)
            else:
                if img_url.hostname:
                    img_resp = requests.get(img_url.geturl())
                else:
                    # relative URL: resolve it against the page URL
                    img_resp = requests.get(urllib.parse.urljoin(url, img_url.geturl()))
                assert img_resp.ok
                saved_path = f'{i}{extension}'
                with open(os.path.join(tempdir, saved_path), 'wb') as f:
                    f.write(img_resp.content)
                img['src'] = saved_path

        # update relative links to point back at the original site
        for a in doc_dom.find_all('a'):
            if not a.has_attr('href'):
                echo('Malformed a tag - no href - skipping: ' + str(a), err=True)
                continue
            a_url = urllib.parse.urlparse(a['href'])
            if not a_url.hostname:
                a['href'] = urllib.parse.urljoin(url, a['href'])

        with open(os.path.join(tempdir, nice_name + '.html'), 'w') as f:
            f.write(str(doc_dom))

        # bundle everything, plus a small meta file recording the source
        # URL and fetch time, into a single zip
        with zipfile.ZipFile(nice_name + '.zip', 'w') as zipf:
            zipf.writestr(os.path.join(nice_name, 'meta'),
                          '\n'.join([url, str(datetime.datetime.utcnow()) + ' UTC']))
            for p in os.listdir(tempdir):
                zipf.write(os.path.join(tempdir, p), arcname=os.path.join(nice_name, p))


if __name__ == '__main__':
    save_page()
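
Usage is a single command. Assuming the script is saved as save_page.py (the filename is arbitrary, and the output name below is illustrative), the zip is named after the slugified page title:

    $ python3 save_page.py https://example.com/some-article
    $ ls
    some-article-title.zip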
@charles-l (Author)

Maybe TODO:

  • Make it faster by fetching images in parallel? Might be useful for image-heavy pages (see the thread-pool sketch below).
  • Fix URLs for other elements on the page besides just images and anchors? (A generic attribute-rewriting sketch follows this list.)
  • More testing (tables, videos, weird layouts, etc.). This is mostly targeted at scraping personal web pages, so I'm not sure how broadly useful it can be.
  • Potentially include JavaScript resources for interactive, explanation-y blog posts? I'm not sure whether readability would strip those elements out (justifiably).
  • Go crazy and implement an indexer and full-text search. Maybe a full personalized archive.org thing that lives in a single SQLite database (see the FTS sketch at the end)? Kinda like recoll, but simpler.
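
A minimal sketch of the parallel-fetch idea, using concurrent.futures from the standard library. fetch_image is a hypothetical helper standing in for the per-image request logic in save_page:

    # Sketch: fetch images concurrently with a thread pool.
    # fetch_image is a hypothetical stand-in for the per-image
    # logic inside save_page.
    from concurrent.futures import ThreadPoolExecutor
    import requests

    def fetch_image(img_url):
        resp = requests.get(img_url)
        resp.raise_for_status()
        return resp.content

    def fetch_all(img_urls, max_workers=8):
        # Threads are fine here: the work is network-bound, not CPU-bound.
        with ThreadPoolExecutor(max_workers=max_workers) as pool:
            return list(pool.map(fetch_image, img_urls))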

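For the other-elements bullet, one approach is a table mapping tags to their URL-bearing attributes, then a single pass that absolutizes whatever is relative. The tag-to-attribute map here is an assumption about which elements matter:

    # Sketch: absolutize URLs on more than just <img> and <a>.
    # The URL_ATTRS map is an assumption; extend as needed.
    import urllib.parse

    URL_ATTRS = {'img': 'src', 'a': 'href', 'source': 'src',
                 'video': 'src', 'audio': 'src', 'link': 'href'}

    def absolutize(doc_dom, base_url):
        for tag, attr in URL_ATTRS.items():
            for el in doc_dom.find_all(tag):
                if el.has_attr(attr) and not urllib.parse.urlparse(el[attr]).hostname:
                    el[attr] = urllib.parse.urljoin(base_url, el[attr])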
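And a rough cut of the last bullet: SQLite ships with the FTS5 extension, so a personal index could be a single virtual table. The schema and the get_text() extraction are assumptions about how the archive would be stored:

    # Sketch: full-text index over saved pages in one SQLite database,
    # using the built-in FTS5 extension. Schema is an assumption.
    import sqlite3

    db = sqlite3.connect('archive.db')
    db.execute('CREATE VIRTUAL TABLE IF NOT EXISTS pages '
               'USING fts5(title, url, body)')

    def index_page(title, url, doc_dom):
        # doc_dom is the BeautifulSoup tree from save_page;
        # get_text() flattens it to plain text for indexing.
        db.execute('INSERT INTO pages VALUES (?, ?, ?)',
                   (title, url, doc_dom.get_text()))
        db.commit()

    def search(query):
        return db.execute(
            'SELECT title, url FROM pages WHERE pages MATCH ?',
            (query,)).fetchall()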