Skip to content

Instantly share code, notes, and snippets.

Forked from dirkjanfaber/
Created February 22, 2023 07:47
What would you like to do?
Download "The Story of Us" (Wait But Why) as an EPUB
# Build "The Story of Us" (Tim Urban, waitbutwhy) as one size-reduced EPUB.
#
# Usage:   <this script> CHAPTER_URL...
# Output:  /tmp/thestoryofus.epub
# Needs:   curl, hxnormalize, xsltproc, pandoc, unzip, zip, ImageMagick's
#          convert, and a sed that supports in-place editing with -i.
#
# NOTE(review): the copy of this script under review had lost several lines
# (here-document terminators, XSLT closing tags, the XSLT namespace URI, the
# loop's do/done and the hard-coded chapter URL list). The structure has been
# restored below. The chapter URLs could not be recovered, so they are taken
# from the command line, in reading order, instead of being guessed.
set -euo pipefail

readonly output=/tmp/thestoryofus.epub

# Print a message on stderr and abort.
die() {
  printf '%s\n' "$*" >&2
  exit 1
}

(( $# > 0 )) || die "usage: ${0##*/} CHAPTER_URL..."

for tool in curl hxnormalize xsltproc pandoc unzip zip convert; do
  command -v "${tool}" >/dev/null || die "required tool not found: ${tool}"
done

dir=$(mktemp -d) || die "mktemp failed"
# Remove the temporary files on every exit path, including failures.
trap 'rm -rf -- "${dir:?}"' EXIT

# Filtered chapter files, in the order they are handed to pandoc.
declare -a input=()

# Book metadata as YAML fields. The delimiter is quoted so the shell copies
# the text literally.
cat <<'__EOT__' > "${dir}/metadata.txt"
title: The Story of Us
author: Tim Urban
rights: Creative Commons Non-Commercial Share Alike 3.0
language: en-US
__EOT__

# Filter for filtering out the non relevant parts of the html files.
# The first template copies every node and attribute through unchanged; each
# empty template after it drops the element it matches.
# NOTE(review): the xsl:copy wrapper and the closing tags were missing from
# the reviewed copy and are restored here following the usual
# "identity transform plus empty templates" pattern - confirm against the
# upstream gist.
cat <<'__EOT__' > "${dir}/filter.xslt"
<?xml version="1.0" encoding="utf-8"?>
<xsl:stylesheet xmlns:xsl="http://www.w3.org/1999/XSL/Transform" version="1.0">
<xsl:output method="html" omit-xml-declaration="yes" indent="yes"/>
<xsl:strip-space elements="*" />
<xsl:preserve-space elements="html body div" />
<xsl:template match="@* | node()">
  <xsl:copy>
    <xsl:apply-templates select="@* | node()"/>
  </xsl:copy>
</xsl:template>
<xsl:template match="div[@id='sidebar']"/>
<xsl:template match="div[@id='disqus_thread']"/>
<xsl:template match="div[@id='social-ads']"/>
<xsl:template match="div[@class='related-posts']"/>
<xsl:template match="div[@class='entry-author']"/>
<xsl:template match="div[@class='mobile-menu']"/>
<xsl:template match="div[@class='social']"/>
<xsl:template match="div[@class='social_counter']"/>
<xsl:template match="div[@class='logo-section']"/>
<xsl:template match="div[@class='left']"/>
<xsl:template match="div[@class='entry-nav main-color-bg']"/>
<xsl:template match="footer[@id='footer']"/>
<xsl:template match="script"/>
<xsl:template match="style"/>
<xsl:template match="link"/>
<xsl:template match="nav"/>
</xsl:stylesheet>
__EOT__

# Download all the blog posts and filter the html with the created filter
for link in "$@"; do
  # Local file name: last path component of the URL (a trailing slash is
  # ignored so that ".../chapter/" still yields a usable name).
  page=${link%/}
  page=${page##*/}
  [[ -n "${page}" ]] || die "cannot derive a file name from: ${link}"

  # Download the chapter; --fail turns HTTP errors into a non-zero exit
  # instead of saving the error page as a chapter.
  curl --fail --location -o "${dir}/${page}" "${link}"

  # First normalize the html, then filter the html to strip out non-relevant
  # parts as scripts, links and commands
  hxnormalize "${dir}/${page}" \
    | xsltproc --html "${dir}/filter.xslt" - > "${dir}/f-${page}"

  # Also replace the strong tags for chapters by h1 tags, so they will end up
  # as chapters in the final epub
  sed -i -e 's%<strong>.*\(Chapter[^<]*\)</strong>%<h1>\1</h1>%' \
    "${dir}/f-${page}"

  input+=("${dir}/f-${page}")
done

# Convert the files to 1 epub file. The metadata is YAML, so it is passed
# with --metadata-file; --epub-metadata expects an XML file of Dublin Core
# elements instead.
pandoc -f html \
  -t epub3 \
  --metadata-file="${dir}/metadata.txt" \
  -o "${dir}/thestoryofus.epub" \
  "${input[@]}"

# This epub is way too large because of the 100MB of images, so we
# extract the epub and reduce the image size, before packing
# again. An epub file is just a zip file, so just extract it
mkdir "${dir}/epub"
unzip "${dir}/thestoryofus.epub" -d "${dir}/epub"

# Convert all of the images to max 640x480 and make them grayscale. The
# original notes report about 39 MB after resizing and 21 MB after the
# grayscale step. Both steps run in one convert call per file so each image
# is re-encoded only once; -type f keeps convert away from the directory
# itself.
media="${dir}/epub/EPUB/media"
if [[ -d "${media}" ]]; then
  find "${media}" -type f \
    -exec convert {} -verbose -resize '640x480>' \
      -colorspace Gray -separate -average {} \;
fi

# And pack the file again (the original notes report a 19MB epub).
# An EPUB container needs "mimetype" as its first entry, stored uncompressed,
# so it is added on its own before everything else. A stale output file is
# removed first because zip would otherwise update it in place.
rm -f -- "${output}"
(
  cd "${dir}/epub"
  zip -X0 "${output}" mimetype
  zip -Xr9D "${output}" . -x mimetype
)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment