@m-ueberall
Created August 26, 2018 07:58
Quick-and-dirty script to work around the currently broken/missing HTML and PDF archive downloads from https://lxd.readthedocs.io/
#!/bin/bash
#quick and dirty script to download the current version of the HTML files from https://lxd.readthedocs.io/en/latest/ since both the HTML and PDF exports are broken at the moment
#required external tools: wget, egrep, sed, perl, find, google-chrome-[un]stable, pdftk
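#optional sanity check: abort early if one of the required tools is missing from $PATH
#(adjust/drop google-chrome-unstable here if you use google-chrome-stable further below)
for TOOL in wget egrep sed perl find google-chrome-unstable pdftk; do
  command -v "$TOOL" >/dev/null 2>&1 || { echo "missing required tool: $TOOL" >&2; exit 1; }
done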
BASEDIR=lxd.readthedocs.io/en/latest
#download all the files, but make sure nothing is "in the way" first
rm -Rf "./${BASEDIR:?}/"
echo Downloading all HTML files from https://$BASEDIR/ ...
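#-r/-np: recurse below the start URL only, -l 0: unlimited recursion depth, -k: convert links for local browsing;
#LANG=C forces English output so the egrep pattern below matches the "... saved" lines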
LANG=C wget -l 0 -r -np -k https://$BASEDIR/ 2>&1 | egrep '\.html.*saved' >lxd-latest_readthedocs_io.log
#the following list of html files is already in order due to the "prev"/"next" links used on the bottom of the pages
HTMLFILES=$(sed -e "s|' saved.*$||" -e "s|^.*'||" lxd-latest_readthedocs_io.log)
PDFFILES=${HTMLFILES//.html/.html.pdf}
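#(the --print-to-pdf call below writes each PDF next to its source file as <page>.html.pdf)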
#(temporarily) get rid of the left margin that would otherwise remain once the navigation menu is stripped further below
sed -i.bak 's|wy-nav-content-wrap{margin-left:300px|wy-nav-content-wrap{margin-left:0px|g' $BASEDIR/css/theme.css
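#back up all HTML files so the stripped versions can be replaced by the originals again afterwards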
find $BASEDIR -name "*.html" -exec cp -a {} {}.bak \;
#strip unwanted parts of the HTML files
echo Stripping unwanted parts of HTML files for PDF output ...
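#(the two regexes below drop the left-hand navigation sidebar and the "rst-versions" selector at the bottom of each page)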
find $BASEDIR -name "*.html" -exec perl -i -pe 'BEGIN{undef $/;} s@<nav data-toggle="wy-nav-shift" class="wy-nav-side stickynav">.*?</nav>@@smg; s@<div class="rst-versions" data-toggle="rst-versions" role="note" aria-label="versions">.*?</div>.*?</div>@@smg' {} \;
#generate individual PDF files from HTML files (works with google-chrome-stable as well)
echo Generating PDF files ...
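#(stderr is discarded because headless Chrome tends to be rather verbose)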
find $BASEDIR -name "*.html" -exec google-chrome-unstable --headless --print-to-pdf={}.pdf {} \; 2>/dev/null
#create single PDF file
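#note: $PDFFILES must remain unquoted so that word splitting passes one argument per file to pdftk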
pdftk $PDFFILES cat output $BASEDIR/lxd-latest.pdf
#put backups back into place
find $BASEDIR -type f \( -name "*.html" -o -name "*.css" \) -exec mv -f {}.bak {} \; 2>/dev/null
#additional clean-up (per-page PDFs, download log) could go here
echo Done.