Created
August 26, 2018 07:58
-
-
Save m-ueberall/b1e6e3a4dfe8b615fd137768a0aeb19c to your computer and use it in GitHub Desktop.
Quick and dirty script to work around the currently broken/missing HTML and PDF archive downloads from https://lxd.readthedocs.io/
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
# Quick and dirty script to download the current version of the HTML files from
# https://lxd.readthedocs.io/en/latest/ and merge them into one PDF, as both the
# HTML and the PDF export are broken atm.
#
# Required external tools: wget, grep, sed, perl, find (GNU), pdftk and
# google-chrome-unstable or google-chrome-stable.
#
# Outputs (relative to the current directory):
#   lxd-latest_readthedocs_io.log            wget "saved" lines for the HTML files
#   lxd.readthedocs.io/en/latest/            mirrored site (HTML/CSS left unmodified)
#   lxd.readthedocs.io/en/latest/lxd-latest.pdf   the merged PDF
set -u

BASEDIR=lxd.readthedocs.io/en/latest
LOGFILE=lxd-latest_readthedocs_io.log

# Print a message to stderr and abort.
die() {
  printf '%s\n' "$*" >&2
  exit 1
}

# Move every *.html.bak / *.css.bak created below back over its original.
# Only backups that actually exist are touched, so no errors need hiding.
restore_backups() {
  [[ -d "$BASEDIR" ]] || return 0
  find "$BASEDIR" -type f \( -name "*.html.bak" -o -name "*.css.bak" \) \
    -exec sh -c 'for bak; do mv -f -- "$bak" "${bak%.bak}"; done' sh {} +
}

# Make sure everything we need is installed before deleting or downloading anything.
for tool in wget grep sed perl find pdftk; do
  command -v "$tool" >/dev/null 2>&1 || die "Required tool not found: $tool"
done
CHROME=
for candidate in google-chrome-unstable google-chrome-stable; do
  if command -v "$candidate" >/dev/null 2>&1; then
    CHROME=$candidate
    break
  fi
done
[[ -n "$CHROME" ]] || die "Required tool not found: google-chrome-unstable or google-chrome-stable"

# Download all the files, but make sure nothing is "in the way" first.
# ${BASEDIR:?} aborts instead of running "rm -Rf ./" should BASEDIR ever be empty.
rm -Rf -- "./${BASEDIR:?}/"
echo "Downloading all HTML files from https://$BASEDIR/ ..."
# LC_ALL (not LANG) is forced to C: LC_ALL overrides LANG, and the sed
# expressions below expect wget's messages in the form 'file' saved with plain
# ASCII apostrophes.
# wget's exit status is intentionally not checked; a recursive crawl can report
# failure because of a single bad link. The result is validated via the log.
LC_ALL=C wget -l 0 -r -np -k "https://$BASEDIR/" 2>&1 \
  | grep -E '\.html.*saved' >"$LOGFILE"

# The following list of HTML files is already in order due to the "prev"/"next"
# links used on the bottom of the pages.
mapfile -t HTMLFILES < <(sed -e "s|' saved.*$||" -e "s|^.*'||" "$LOGFILE")
(( ${#HTMLFILES[@]} > 0 )) || die "No HTML files were downloaded from https://$BASEDIR/"

# From here on files in the mirror get modified; put the backups back on any exit.
trap restore_backups EXIT

# (Temporarily) get rid of the remaining left margin after we remove the menu
# contents later on. Best effort: sed reports the problem itself if the
# stylesheet is missing, and the PDFs are still generated.
sed -i.bak 's|wy-nav-content-wrap{margin-left:300px|wy-nav-content-wrap{margin-left:0px|g' \
  "$BASEDIR/css/theme.css"
find "$BASEDIR" -name "*.html" -exec cp -a {} {}.bak \;

# Strip unwanted parts of the HTML files (side navigation and version selector).
# "undef $/" makes perl read each file as one string so the patterns can span lines.
echo "Stripping unwanted parts of HTML files for PDF output ..."
find "$BASEDIR" -name "*.html" -exec perl -i -pe 'BEGIN{undef $/;} s@<nav data-toggle="wy-nav-shift" class="wy-nav-side stickynav">.*?</nav>@@smg; s@<div class="rst-versions" data-toggle="rst-versions" role="note" aria-label="versions">.*?</div>.*?</div>@@smg' {} \;

# Generate individual PDF files from the HTML files (one <name>.html.pdf each).
# Relies on GNU find replacing {} inside the --print-to-pdf=... argument.
# Chrome's stderr is discarded on purpose; missing PDFs are detected below.
echo "Generating PDF files ..."
find "$BASEDIR" -name "*.html" -exec "$CHROME" --headless --print-to-pdf={}.pdf {} \; 2>/dev/null

# Collect the per-page PDFs in download order, skipping pages that failed to render.
PDFFILES=()
for html in "${HTMLFILES[@]}"; do
  if [[ -f "$html.pdf" ]]; then
    PDFFILES+=("$html.pdf")
  else
    printf 'Warning: no PDF was generated for %s, skipping it\n' "$html" >&2
  fi
done
(( ${#PDFFILES[@]} > 0 )) || die "No PDF files were generated"

# Create single PDF file.
pdftk "${PDFFILES[@]}" cat output "$BASEDIR/lxd-latest.pdf" \
  || die "pdftk failed to create $BASEDIR/lxd-latest.pdf"

# Put backups back into place.
restore_backups
trap - EXIT

# Clean up here...
echo "Done."
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment