Skip to content

Instantly share code, notes, and snippets.



Created Apr 2, 2017
What would you like to do?
# Archive a website with httrack and package it as a timestamped tarball.
#
# Usage: make root_domain=example.com
ts := $(shell /bin/date "+%Y%m%d%H%M%S")
# Intentionally empty here — must be supplied on the command line,
# e.g. `make root_domain=example.com`. Guarded below before any `rm -rf`.
root_domain :=
archive_file := ${root_domain}-archive-$(ts)

# Command targets, not files — always considered out of date.
.PHONY: all clean generate

all: clean generate

# Remove the httrack working directory and any previous archives.
# Guard first: with an empty root_domain the globs below would expand to
# a bare `-archive*` pattern and could delete unrelated files.
clean:
	$(if $(strip $(root_domain)),,$(error root_domain is empty; invoke as `make root_domain=example.com`))
	rm -rf archive
	rm -rf ${root_domain}-archive*.tar.gz
	rm -rf ${root_domain}-archive*

# Scrape the site, flatten the httrack output, patch links, and tar it up.
generate:
	$(if $(strip $(root_domain)),,$(error root_domain is empty; invoke as `make root_domain=example.com`))
	# Scrape the site
	httrack "https://${root_domain}" -w -O "./archive" -I0 -N "%h%p/%n/index%[page].%t" -%v --robots=0 -c10 -%e0
	# Move the archive out of the httrack directory structure and delete its cache
	mv archive/${root_domain}/ ${archive_file}
	rm -rf archive
	# Move the index page and delete its directory
	mv ${archive_file}/index/index.html ${archive_file}/index.html
	rm -rf ${archive_file}/index/
	# Copy the 404 page
	mkdir ${archive_file}/404/
	cp 404.html ${archive_file}/404/index.html
	# Update the links to remove 'index.html' from the end - this requires correct server setup.
	# NOTE: with BSD/macOS sed, `-i -e` treats `-e` as the backup suffix and
	# leaves `*.html-e` files behind; they are cleaned up two lines below.
	find ${archive_file} -name '*.html' -exec sed -i -e 's/index\/index.html//g' {} \;
	find ${archive_file} -name '*.html' -exec sed -i -e 's/index.html//g' {} \;
	# Remove all the *-e backup files left by sed
	find ${archive_file} -name '*.html-e' -delete
	# Create the tarball
	tar -cvzf ${archive_file}.tar.gz ${archive_file}/
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment