j0057/wardley-maps.py

## wardley-maps.py
#!/usr/bin/env python3.7

# take a bunch of Medium URLs, download them, remove the crap and bundle them up into something for your e-reader

import lxml
import lxml.etree
import lxml.builder
import requests

import io
import os
import os.path

# Medium articles to bundle, in the order they are appended to the output.
urls = [
    f"https://medium.com/wardleymaps/{slug}"
    for slug in (
        "on-being-lost-2ef5f05eb1ec",
        "finding-a-path-cdb1249078c0",
        "exploring-the-map-ad0266fad59b",
        "doctrine-8bb0015688e5",
        "the-play-and-a-decision-to-act-8eb796b1dff1",
        "getting-started-yourself-e1a359b785a2",
        "finding-a-new-purpose-8c60c9484d3b",
        "keeping-the-wolves-at-bay-93de21b6b2f8",
        "anticipation-89692e9b0ced",
        "i-wasnt-expecting-that-dcfe122a2234",
        "a-smorgasbord-of-the-slightly-useful-2498a1163dd6",
        "the-scenario-8bc05feee81",
        "something-wicked-this-way-comes-b028d5c607bf",
        "to-thine-own-self-be-true-543cfe2cd1a",
        "on-the-practice-of-scenario-planning-49eed8279e90",
        "round-round-get-around-i-loop-around-d88e865d4337",
        "to-infinity-and-beyond-c7a53ccd2a07",
        "better-for-less-58fe8c0a3aaa",
        "on-playing-chess-2634b825dbac",
    )
]

# Empty output document; the content extracted from each article is appended to its <body>.
result = lxml.etree.parse(
    io.StringIO(
        '<!DOCTYPE html>\n'
        '<html><head><title>Wardley Maps</title></head><body/></html>'
    ),
    lxml.etree.HTMLParser(),
)

def _fetch_cached(url, path, timeout=30):
    """Download *url* to *path* unless *path* already exists.

    The whole response is fetched before the file is opened, so a failed
    request does not leave a truncated cache file behind.

    Raises ``requests.HTTPError`` for an error status and a
    ``requests`` timeout exception if the server stalls.
    """
    if os.path.exists(path):
        return
    print(f"downloading {url}")
    # requests waits forever unless a timeout is given explicitly
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    with open(path, "wb") as f:
        f.write(response.content)


# Fetch every article (cached in the working directory under the URL's last
# path component), strip it down to its content and append it to `result`.
for url in urls:
    fn = os.path.basename(url)
    _fetch_cached(url, fn)

    # parse from cache; decode explicitly instead of with the locale's
    # default encoding (assumes the cached pages are UTF-8 -- TODO confirm)
    with open(fn, "r", encoding="utf-8") as f:
        html = lxml.etree.parse(f, lxml.etree.HTMLParser())

    # find text: it's everything in the parent of the H1 element
    text = html.xpath("//h1/..")[0]

    # remove the first div with author image buttons and whatnot
    first_div = text.xpath('.//div')[0]
    first_div.getparent().remove(first_div)

    # remove all `name`, `id` and `class` attributes; only the subtree that
    # is kept needs cleaning, so do not walk the whole page with "//*"
    for elem in text.xpath("descendant-or-self::*"):
        for name in ["id", "name", "class"]:
            if name in elem.attrib:
                del elem.attrib[name]

    # decrapify images: swap each <figure> for a bare <img> pointing at a local copy
    for figure in text.xpath(".//figure"):
        imgs = figure.xpath(".//img[@src]")
        if not imgs:
            # no image with a source to download; leave this figure untouched
            continue
        src = imgs[0].attrib["src"]
        # the local name is the remote file name minus its first two characters
        imgfn = os.path.basename(src)[2:]
        _fetch_cached(src, imgfn)

        attrs = {"src": imgfn}
        # carry over the style of the figure's first child when it has one
        style = figure[0].get("style")
        if style is not None:
            attrs["style"] = style
        # ".//figure" also matches nested figures, so replace through the
        # figure's real parent rather than assuming it is a child of `text`
        figure.getparent().replace(figure, lxml.builder.E.img(**attrs))

    # keep image captions with image on page break: wrap each top-level image
    # together with the element just before it (presumably its caption)
    for img in text.xpath("./img"):
        prev = img.getprevious()
        if prev is None:
            # nothing precedes the image, so there is nothing to group it with
            continue
        text.insert(text.index(prev), lxml.builder.E.div(img, prev, style="break-inside: avoid"))

    # prevent orphans and widows (does not seem to actually work)
    for p in text.xpath("./p"):
        p.attrib["style"] = "orphans: 2; widows: 2"

    # start chapters on new page (every top-level h1 except the first one)
    for h1 in text.xpath("./h1")[1:]:
        h1.attrib["style"] = "break-before: always"

    # save content
    result.xpath('/html/body')[0].append(text)

# Serialize the assembled document into the working directory.
with open("wardley-maps.html", "wb") as output:
    result.write(output, pretty_print=True)
	#!/usr/bin/env python3.7

	# take a bunch of Medium URLs, download them, remove the crap and bundle them up into something for your e-reader

	import lxml
	import lxml.etree
	import lxml.builder
	import requests

	import io
	import os
	import os.path

	# Medium articles to bundle, in the order they are appended to the output.
	urls = [
	    f"https://medium.com/wardleymaps/{slug}"
	    for slug in (
	        "on-being-lost-2ef5f05eb1ec",
	        "finding-a-path-cdb1249078c0",
	        "exploring-the-map-ad0266fad59b",
	        "doctrine-8bb0015688e5",
	        "the-play-and-a-decision-to-act-8eb796b1dff1",
	        "getting-started-yourself-e1a359b785a2",
	        "finding-a-new-purpose-8c60c9484d3b",
	        "keeping-the-wolves-at-bay-93de21b6b2f8",
	        "anticipation-89692e9b0ced",
	        "i-wasnt-expecting-that-dcfe122a2234",
	        "a-smorgasbord-of-the-slightly-useful-2498a1163dd6",
	        "the-scenario-8bc05feee81",
	        "something-wicked-this-way-comes-b028d5c607bf",
	        "to-thine-own-self-be-true-543cfe2cd1a",
	        "on-the-practice-of-scenario-planning-49eed8279e90",
	        "round-round-get-around-i-loop-around-d88e865d4337",
	        "to-infinity-and-beyond-c7a53ccd2a07",
	        "better-for-less-58fe8c0a3aaa",
	        "on-playing-chess-2634b825dbac",
	    )
	]

	# Empty output document; the content extracted from each article is appended to its <body>.
	result = lxml.etree.parse(
	    io.StringIO(
	        '<!DOCTYPE html>\n'
	        '<html><head><title>Wardley Maps</title></head><body/></html>'
	    ),
	    lxml.etree.HTMLParser(),
	)

	def _fetch_cached(url, path, timeout=30):
	    """Download *url* to *path* unless *path* already exists.

	    The whole response is fetched before the file is opened, so a failed
	    request does not leave a truncated cache file behind.

	    Raises ``requests.HTTPError`` for an error status and a
	    ``requests`` timeout exception if the server stalls.
	    """
	    if os.path.exists(path):
	        return
	    print(f"downloading {url}")
	    # requests waits forever unless a timeout is given explicitly
	    response = requests.get(url, timeout=timeout)
	    response.raise_for_status()
	    with open(path, "wb") as f:
	        f.write(response.content)


	# Fetch every article (cached in the working directory under the URL's last
	# path component), strip it down to its content and append it to `result`.
	for url in urls:
	    fn = os.path.basename(url)
	    _fetch_cached(url, fn)

	    # parse from cache; decode explicitly instead of with the locale's
	    # default encoding (assumes the cached pages are UTF-8 -- TODO confirm)
	    with open(fn, "r", encoding="utf-8") as f:
	        html = lxml.etree.parse(f, lxml.etree.HTMLParser())

	    # find text: it's everything in the parent of the H1 element
	    text = html.xpath("//h1/..")[0]

	    # remove the first div with author image buttons and whatnot
	    first_div = text.xpath('.//div')[0]
	    first_div.getparent().remove(first_div)

	    # remove all `name`, `id` and `class` attributes; only the subtree that
	    # is kept needs cleaning, so do not walk the whole page with "//*"
	    for elem in text.xpath("descendant-or-self::*"):
	        for name in ["id", "name", "class"]:
	            if name in elem.attrib:
	                del elem.attrib[name]

	    # decrapify images: swap each <figure> for a bare <img> pointing at a local copy
	    for figure in text.xpath(".//figure"):
	        imgs = figure.xpath(".//img[@src]")
	        if not imgs:
	            # no image with a source to download; leave this figure untouched
	            continue
	        src = imgs[0].attrib["src"]
	        # the local name is the remote file name minus its first two characters
	        imgfn = os.path.basename(src)[2:]
	        _fetch_cached(src, imgfn)

	        attrs = {"src": imgfn}
	        # carry over the style of the figure's first child when it has one
	        style = figure[0].get("style")
	        if style is not None:
	            attrs["style"] = style
	        # ".//figure" also matches nested figures, so replace through the
	        # figure's real parent rather than assuming it is a child of `text`
	        figure.getparent().replace(figure, lxml.builder.E.img(**attrs))

	    # keep image captions with image on page break: wrap each top-level image
	    # together with the element just before it (presumably its caption)
	    for img in text.xpath("./img"):
	        prev = img.getprevious()
	        if prev is None:
	            # nothing precedes the image, so there is nothing to group it with
	            continue
	        text.insert(text.index(prev), lxml.builder.E.div(img, prev, style="break-inside: avoid"))

	    # prevent orphans and widows (does not seem to actually work)
	    for p in text.xpath("./p"):
	        p.attrib["style"] = "orphans: 2; widows: 2"

	    # start chapters on new page (every top-level h1 except the first one)
	    for h1 in text.xpath("./h1")[1:]:
	        h1.attrib["style"] = "break-before: always"

	    # save content
	    result.xpath('/html/body')[0].append(text)

	# Serialize the assembled document into the working directory.
	with open("wardley-maps.html", "wb") as f:
	    result.write(f, pretty_print=True)