Skip to content

Instantly share code, notes, and snippets.

@j0057
Created March 9, 2019 10:30
Show Gist options
  • Save j0057/3c55982a61455324ced5f683ca0f996d to your computer and use it in GitHub Desktop.
Save j0057/3c55982a61455324ced5f683ca0f996d to your computer and use it in GitHub Desktop.
Take a bunch of Medium URLs and hack them up into something more portable
#!/usr/bin/env python3.7
# take a bunch of Medium URLs, download them, remove the crap and bundle them up into something for your e-reader
import lxml
import lxml.etree
import lxml.builder
import requests
import io
import os
import os.path
# Articles to bundle, in the order they are processed.  Every article lives
# under the same Medium publication, so only the final path segment is listed.
_PUBLICATION = "https://medium.com/wardleymaps"
_SLUGS = (
    "on-being-lost-2ef5f05eb1ec",
    "finding-a-path-cdb1249078c0",
    "exploring-the-map-ad0266fad59b",
    "doctrine-8bb0015688e5",
    "the-play-and-a-decision-to-act-8eb796b1dff1",
    "getting-started-yourself-e1a359b785a2",
    "finding-a-new-purpose-8c60c9484d3b",
    "keeping-the-wolves-at-bay-93de21b6b2f8",
    "anticipation-89692e9b0ced",
    "i-wasnt-expecting-that-dcfe122a2234",
    "a-smorgasbord-of-the-slightly-useful-2498a1163dd6",
    "the-scenario-8bc05feee81",
    "something-wicked-this-way-comes-b028d5c607bf",
    "to-thine-own-self-be-true-543cfe2cd1a",
    "on-the-practice-of-scenario-planning-49eed8279e90",
    "round-round-get-around-i-loop-around-d88e865d4337",
    "to-infinity-and-beyond-c7a53ccd2a07",
    "better-for-less-58fe8c0a3aaa",
    "on-playing-chess-2634b825dbac",
)
urls = [f"{_PUBLICATION}/{slug}" for slug in _SLUGS]
# Empty HTML shell of the output document, parsed into an lxml tree.
_SKELETON = '<!DOCTYPE html>\n<html><head><title>Wardley Maps</title></head><body/></html>'
result = lxml.etree.parse(io.StringIO(_SKELETON), parser=lxml.etree.HTMLParser())
def _download_if_missing(url, path, timeout=30):
    """Fetch *url* into the local file *path* unless that file already exists.

    Files already on disk act as a cache, so re-running the script only
    downloads what is missing.  ``raise_for_status()`` aborts on an HTTP error
    response before anything is written.
    """
    if os.path.exists(path):
        return
    print(f"downloading {url}")
    # A timeout keeps one stalled connection from hanging the whole run.
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    with open(path, "wb") as f:
        f.write(response.content)


# every article is appended to the <body> of the output skeleton
body = result.xpath('/html/body')[0]

for url in urls:
    fn = os.path.basename(url)
    _download_if_missing(url, fn)

    # parse from cache; read raw bytes and state the encoding explicitly so the
    # result does not depend on the platform's default text encoding
    with open(fn, "rb") as f:
        html = lxml.etree.parse(f, lxml.etree.HTMLParser(encoding="utf-8"))

    # find text: it's everything in the parent of the H1 element
    # (an IndexError here means the page contains no <h1> at all)
    text = html.xpath("//h1/..")[0]

    # remove the first div with author image buttons and whatnot
    first_div = text.find(".//div")
    if first_div is not None:
        first_div.getparent().remove(first_div)

    # remove all `name`, `id` and `class` attributes from the article subtree
    for elem in text.xpath("descendant-or-self::*"):
        for name in ("id", "name", "class"):
            elem.attrib.pop(name, None)

    # decrapify images: swap each <figure> for a plain <img> pointing at a
    # locally cached copy of the picture
    for figure in text.xpath(".//figure"):
        img = figure.find(".//img")
        src = img.get("src") if img is not None else None
        if not src:
            # figure without a usable <img src>: nothing to download, keep it
            continue
        # NOTE(review): the first two characters of the file name are dropped,
        # presumably a prefix in Medium's image names -- confirm
        imgfn = os.path.basename(src)[2:]
        _download_if_missing(src, imgfn)
        attrs = {"src": imgfn}
        # carry over the sizing style from the figure's first child, if any
        style = figure[0].get("style")
        if style is not None:
            attrs["style"] = style
        # replace via the real parent: the figure need not be a direct child
        # of `text`
        figure.getparent().replace(figure, lxml.builder.E.img(**attrs))

    # keep image captions with image on page break
    for img in text.xpath("./img"):
        prev = img.getprevious()
        if prev is None:
            # image is the first child: there is no sibling to pair it with
            continue
        # the position must be read before E.div() moves both nodes out of
        # `text`
        position = text.index(prev)
        # NOTE(review): the wrapper holds the image first, then its former
        # previous sibling -- confirm this is the intended order
        text.insert(position, lxml.builder.E.div(img, prev, style="break-inside: avoid"))

    # prevent orphans and widows (does not seem to actually work)
    for p in text.xpath("./p"):
        p.set("style", "orphans: 2; widows: 2")

    # start chapters on new page (all but the first heading of the article)
    for h1 in text.xpath("./h1")[1:]:
        h1.set("style", "break-before: always")

    # save content: moves the article subtree into the output document
    body.append(text)
# Write the combined document.  Serialise as HTML rather than lxml's default
# XML: XML output writes empty elements self-closed (<div/>), which HTML
# parsers read as an unclosed opening tag.
with open("wardley-maps.html", "wb") as f:
    result.write(f, pretty_print=True, method="html")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment