@dirkjanfaber
Last active February 22, 2023 07:47
Download the story of us as epub (waitbutwhy)
#!/bin/bash
dir=$(mktemp --directory)
declare -a input=()
cat <<__EOT__ > ${dir}/metadata.txt
---
title: The Story of Us
author: Tim Urban
rights: Creative Commons Non-Commercial Share Alike 3.0
language: en-US
...
__EOT__
# XSLT filter for stripping the non-relevant parts out of the HTML files.
cat <<__EOT__ > ${dir}/filter.xslt
<?xml version="1.0" encoding="utf-8"?>
<xsl:stylesheet xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
version="1.0">
<xsl:output method="html" omit-xml-declaration="yes" indent="yes"/>
<xsl:strip-space elements="*" />
<xsl:preserve-space elements="html body div" />
<xsl:template match="@* | node()">
<xsl:copy>
<xsl:apply-templates select="@* | node()"/>
</xsl:copy>
</xsl:template>
<xsl:template match="div[@id='sidebar']"/>
<xsl:template match="div[@id='disqus_thread']"/>
<xsl:template match="div[@id='social-ads']"/>
<xsl:template match="div[@class='related-posts']"/>
<xsl:template match="div[@class='entry-author']"/>
<xsl:template match="div[@class='mobile-menu']"/>
<xsl:template match="div[@class='social']"/>
<xsl:template match="div[@class='social_counter']"/>
<xsl:template match="div[@class='logo-section']"/>
<xsl:template match="div[@class='left']"/>
<xsl:template match="div[@class='entry-nav main-color-bg']"/>
<xsl:template match="footer[@id='footer']"/>
<xsl:template match="script"/>
<xsl:template match="style"/>
<xsl:template match="link"/>
<xsl:template match="nav"/>
</xsl:stylesheet>
__EOT__
# Download all the blog posts and filter the html with the created filter
for link in \
https://waitbutwhy.com/2019/08/story-intro.html \
https://waitbutwhy.com/2019/08/fire-light.html \
https://waitbutwhy.com/2019/08/giants.html \
https://waitbutwhy.com/2019/09/stories.html \
https://waitbutwhy.com/2019/09/enlightenment-kids.html \
https://waitbutwhy.com/2019/09/mute-button.html \
https://waitbutwhy.com/2019/09/american-brain.html \
https://waitbutwhy.com/2019/09/thinking-ladder.html \
https://waitbutwhy.com/2019/10/idea-labs-echo-chambers.html
do
# Download the chapter
curl -o "${dir}/${link##*/}" "${link}"
# First normalize the HTML, then run it through the XSLT filter to strip
# out non-relevant parts such as scripts, links and comments
hxnormalize "${dir}/${link##*/}" |\
xsltproc --html "${dir}/filter.xslt" - > "${dir}/f-${link##*/}"
# Also replace the strong tags for chapters by h1 tags, so they will end up
# as chapters in the final epub
sed -e 's%<strong>.*\(Chapter[^<]*\)</strong>%<h1>\1</h1>%' -i "${dir}/f-${link##*/}"
input+=("${dir}/f-${link##*/}")
done
# Convert the files to one epub file. Note that --epub-metadata expects
# Dublin Core XML; the YAML metadata file goes in via --metadata-file.
pandoc -f html \
-t epub3 \
--metadata-file="${dir}/metadata.txt" \
-o "${dir}/thestoryofus.epub" \
"${input[@]}"
# This epub is way too large because of the 100MB of images, so we
# extract the epub and reduce the image size, before packing
# again. An epub file is just a zip file, so just extract it
mkdir "${dir}/epub"
unzip "${dir}/thestoryofus.epub" -d "${dir}/epub"
# And convert all of the images to max 640x480. This reduces
# the size to about 39 MB
find "${dir}/epub/EPUB/media" -type f -exec convert \{} -verbose -resize 640x480\> \{} \;
# We also want to make them grayscale to save some more space
# This gets the total down to 21MB
find "${dir}/epub/EPUB/media" -type f -exec convert \{} -verbose -colorspace Gray -separate -average \{} \;
# And pack the file again, making the new epub 19MB. A valid epub needs the
# mimetype file as its first entry, stored without compression, so add it
# first and then grow the archive with everything else.
cd "${dir}/epub"
zip -X0 /tmp/thestoryofus.epub mimetype
zip -rg9 /tmp/thestoryofus.epub . -x mimetype
cd -
# Remove the temporary files
rm -rf "${dir}"
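The chapter-heading rewrite is easy to sanity-check in isolation. This snippet runs a made-up sample line (not taken from the actual blog posts) through the same sed expression the script uses:

```shell
# Run an invented sample line through the sed expression from the script
# above; the <strong> chapter marker should be promoted to an <h1> heading.
echo '<p><strong>Chapter 1: The Great Battle</strong></p>' |
  sed -e 's%<strong>.*\(Chapter[^<]*\)</strong>%<h1>\1</h1>%'
# prints: <p><h1>Chapter 1: The Great Battle</h1></p>
```

Because the capture group starts at `Chapter`, any prefix inside the `<strong>` tag is dropped from the resulting heading.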
dirkjanfaber commented Dec 1, 2019

Script to convert Tim Urban's blog posts on "The Story of Us" into an e-reader-friendly ebook. It does not work perfectly, but it does a reasonable job.

Do note that you need a few programs installed in order to get this working:

apt-get install pandoc zip unzip curl imagemagick xsltproc html-xml-utils
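If you would rather fail early than midway through the downloads, a small pre-flight check along these lines (a sketch, not part of the gist itself) can confirm the tools the script calls are on `$PATH`:

```shell
# Hypothetical pre-flight check: report any required tool missing from $PATH.
for cmd in curl sed pandoc zip unzip convert xsltproc hxnormalize; do
  command -v "$cmd" >/dev/null 2>&1 || echo "missing: $cmd"
done
```

An empty output means all dependencies were found.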
