Created
March 9, 2019 10:30
-
-
Save j0057/3c55982a61455324ced5f683ca0f996d to your computer and use it in GitHub Desktop.
Take a bunch of Medium URLs and hack them up into something more portable
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3.7
# take a bunch of Medium URLs, download them, remove the crap and bundle them up into something for your e-reader
import io
import os
import os.path
import urllib.parse

import lxml
import lxml.builder
import lxml.etree
import requests
# Articles to bundle, in reading order. The last path segment of each URL
# doubles as the name of the local cache file for the downloaded page.
urls = [
    "https://medium.com/wardleymaps/on-being-lost-2ef5f05eb1ec",
    "https://medium.com/wardleymaps/finding-a-path-cdb1249078c0",
    "https://medium.com/wardleymaps/exploring-the-map-ad0266fad59b",
    "https://medium.com/wardleymaps/doctrine-8bb0015688e5",
    "https://medium.com/wardleymaps/the-play-and-a-decision-to-act-8eb796b1dff1",
    "https://medium.com/wardleymaps/getting-started-yourself-e1a359b785a2",
    "https://medium.com/wardleymaps/finding-a-new-purpose-8c60c9484d3b",
    "https://medium.com/wardleymaps/keeping-the-wolves-at-bay-93de21b6b2f8",
    "https://medium.com/wardleymaps/anticipation-89692e9b0ced",
    "https://medium.com/wardleymaps/i-wasnt-expecting-that-dcfe122a2234",
    "https://medium.com/wardleymaps/a-smorgasbord-of-the-slightly-useful-2498a1163dd6",
    "https://medium.com/wardleymaps/the-scenario-8bc05feee81",
    "https://medium.com/wardleymaps/something-wicked-this-way-comes-b028d5c607bf",
    "https://medium.com/wardleymaps/to-thine-own-self-be-true-543cfe2cd1a",
    "https://medium.com/wardleymaps/on-the-practice-of-scenario-planning-49eed8279e90",
    "https://medium.com/wardleymaps/round-round-get-around-i-loop-around-d88e865d4337",
    "https://medium.com/wardleymaps/to-infinity-and-beyond-c7a53ccd2a07",
    "https://medium.com/wardleymaps/better-for-less-58fe8c0a3aaa",
    "https://medium.com/wardleymaps/on-playing-chess-2634b825dbac",
]
# Seconds to wait on the server before a download is abandoned.
REQUEST_TIMEOUT = 30
# Attributes removed from every element of an extracted article.
STRIPPED_ATTRIBUTES = ("id", "name", "class")
# Name of the combined document written to the current directory.
OUTPUT_FILE = "wardley-maps.html"


def _url_basename(url: str) -> str:
    """Return the last path segment of *url*, ignoring query string and fragment."""
    return os.path.basename(urllib.parse.urlsplit(url).path)


def _download_if_missing(url: str, filename: str) -> None:
    """Fetch *url* into *filename* unless that file already exists.

    Raises:
        requests.HTTPError: if the server answers with an error status.
    """
    if os.path.exists(filename):
        return
    print(f"downloading {url}")
    response = requests.get(url, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()
    with open(filename, "wb") as f:
        f.write(response.content)


def _strip_attributes(text) -> None:
    """Remove the `id`, `name` and `class` attributes from *text* and its descendants."""
    # Relative axis: only the article subtree is kept, so there is no point
    # in walking the rest of the downloaded page.
    for elem in text.xpath("descendant-or-self::*"):
        for name in STRIPPED_ATTRIBUTES:
            elem.attrib.pop(name, None)


def _localise_figures(text) -> None:
    """Replace each <figure> under *text* by a plain <img> pointing at a local copy."""
    for figure in text.xpath(".//figure"):
        images = figure.xpath(".//img")
        if not images:
            continue  # figure without an image: leave it as it is
        src = images[0].get("src")
        if not src:
            continue
        # The first two characters of the file name are dropped (presumably
        # Medium's "1*" prefix -- confirm against a real page).
        image_name = _url_basename(src)[2:]
        _download_if_missing(src, image_name)
        attrs = {"src": image_name}
        # Carry over the style of the figure's first child when it has one.
        style = figure[0].get("style") if len(figure) else None
        if style is not None:
            attrs["style"] = style
        # Replace through the real parent: the figure is not necessarily a
        # direct child of `text`.
        figure.getparent().replace(figure, lxml.builder.E.img(**attrs))


def _keep_images_with_neighbour(text) -> None:
    """Wrap each top-level <img> and its preceding sibling in an unbreakable <div>."""
    for img in text.xpath("./img"):
        prev = img.getprevious()
        if prev is None:
            continue  # image is the first child: nothing to group it with
        # Take the position before building the wrapper, because building it
        # moves both elements out of `text`.
        position = text.index(prev)
        wrapper = lxml.builder.E.div(img, prev, style="break-inside: avoid")
        text.insert(position, wrapper)


def _extract_article(filename: str):
    """Parse the cached page *filename* and return its cleaned-up article element.

    The article is taken to be the parent of the first <h1> in the page.

    Raises:
        ValueError: if the page contains no <h1> element.
    """
    # Read explicitly as UTF-8 rather than the platform default encoding.
    with open(filename, "r", encoding="utf-8") as f:
        html = lxml.etree.parse(f, lxml.etree.HTMLParser())
    # find text: it's everything in the parent of the H1 element
    candidates = html.xpath("//h1/..")
    if not candidates:
        raise ValueError(f"no <h1> element found in {filename}")
    text = candidates[0]
    # remove the first div with author image buttons and whatnot
    divs = text.xpath(".//div")
    if divs:
        divs[0].getparent().remove(divs[0])
    _strip_attributes(text)
    # decrapify images
    _localise_figures(text)
    # keep image captions with image on page break
    _keep_images_with_neighbour(text)
    # prevent orphans and widows (does not seem to actually work)
    for p in text.xpath("./p"):
        p.attrib["style"] = "orphans: 2; widows: 2"
    # start chapters on new page
    for h1 in text.xpath("./h1")[1:]:
        h1.attrib["style"] = "break-before: always"
    return text


def main() -> None:
    """Download every article in `urls` and bundle them into one HTML file."""
    result = lxml.etree.parse(io.StringIO('<!DOCTYPE html>\n<html><head><title>Wardley Maps</title></head><body/></html>'), lxml.etree.HTMLParser())
    body = result.xpath('/html/body')[0]
    for url in urls:
        filename = _url_basename(url)
        _download_if_missing(url, filename)
        # save content
        body.append(_extract_article(filename))
    with open(OUTPUT_FILE, "wb") as f:
        # Serialise as HTML, not XML: XML output self-closes empty elements
        # (e.g. <div/>), which HTML parsers read as an unclosed start tag.
        result.write(f, method="html", pretty_print=True)


if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment