Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
Archive web pages with wget, optionally compressing with tar
#!/bin/bash

# * Defaults

compression="xz"  # tar --auto-compress extension for the archive
subdir="web"      # directory wget downloads into (also the archive name)

# * Functions
function debug {
    # Self-redefining logger: the first call installs either a real logger
    # (when $debug is non-empty) or a no-op, so later calls never re-check
    # the $debug flag.
    if [[ $debug ]]; then
        function debug {
            echo "DEBUG: $@" >&2
        }
        # Don't lose the message that triggered the redefinition.
        debug "$@"
    else
        function debug {
            :
        }
    fi
}
function error {
    # Print the message to stderr and bump the script-wide error count.
    echo "ERROR: $@" >&2
    # Plain assignment instead of ((errors++)): the arithmetic command
    # returns non-zero when the pre-increment value is 0, which made this
    # function's exit status 1 on its first call.  $((errors + 1)) still
    # initializes an unset $errors automatically (treated as 0).
    errors=$((errors + 1))
}
function die {
    # Report the message like error() does, then terminate the script,
    # using the accumulated error count (always >= 1 here) as exit status.
    error "$@"
    exit "$errors"
}
# Print the help text to stdout.
function usage {
# NOTE: the heredoc delimiter is unquoted, so $0 below expands to the
# script path when the text is printed.
cat <<EOF
$0 [OPTIONS] URL ...
Download web pages at URLs with images, stylesheets, etc. to the
current directory, optionally compressing into an archive.
Page resources are stored in a subdirectory, and HTML files are
symlinked to the current directory (which may cause page resources to
not display, so it may be necessary to open the symlink targets
directly).
Options
-d, --debug Print debug info
-h, --help I need somebody!
-a, --archive Compress downloaded files into tar archive
--archive-name NAME Set archive name (not including extension)
--compress-with EXT tar-supported compression method (xz by default)
-f, --flat Download files into one directory instead of domain-based hierarchy.
When not set, link HTML files into top-level subdir.
-s, --subdir NAME Set subdirectory name (web by default)
EOF
}
# * Args

# Parse options with GNU getopt.  NOTE: "s:" is included in the short
# options so the documented "-s NAME" form actually works -- the help
# text advertises it, but it was missing from the option string.
args=$(getopt -n "$0" -o adfhs: -l archive,archive-name:,compress-with:,debug,flat,help,subdir: -- "$@") || exit 1
eval set -- "$args"

while true
do
    case "$1" in
        -d|--debug)
            debug=true
            ;;
        -h|--help)
            usage
            exit
            ;;
        -a|--archive)
            archive=true
            ;;
        --archive-name|-s|--subdir)
            # NOTE: We use the subdir as the archive name.
            shift
            subdir="$1"
            ;;
        --compress-with)
            shift
            compression="$1"
            ;;
        -f|--flat)
            flat=true
            ;;
        --)
            # Remaining args (required; do not remove)
            shift
            rest=("$@")
            break
            ;;
    esac
    shift
done

debug "ARGS: $args"
# Join the remaining args into one word for the log message.
debug "Remaining args: ${rest[*]}"
# ** Prepare wget options

# Build wget's argument list one option at a time so each entry carries
# its own explanation.
wget_options=()
# Don't download JavaScript.
wget_options+=(--ignore-tags=script)
# Don't download fonts.
# TODO: Make optional; add other extensions
wget_options+=(--reject=ttf)
# Skip robots.txt (not only to avoid having things fail to download, but
# because wget saves the robots.txt files!).
wget_options+=(--execute robots=off)
# Give downloaded HTML files .html extension.
wget_options+=(--adjust-extension)
# Span hosts (e.g. for forum pages that include images from image hosts).
# TODO: Make optional
wget_options+=(--span-hosts)
# Convert links to work locally.
wget_options+=(--convert-links)
# Backup original version of converted files.
# TODO: Make optional
#wget_options+=(--backup-converted)
# Get page resources (requisites).  This is the main option that
# retrieves page elements/resources.
wget_options+=(--page-requisites)
# Don't re-download page requisites that already exist locally and are
# up-to-date with the server (if the server returns Last-Modified).
wget_options+=(--timestamping)
# Save resources into the chosen subdirectory.
wget_options+=(--directory-prefix="$subdir")
# With --flat, collapse everything into one directory.
[[ $flat ]] && wget_options+=(--no-directories)
# * Main

# Remember whether the subdirectory already existed, so the cleanup step
# only deletes it if this script created it.
if [[ -e "$subdir" ]]
then
    subdir_existed=true
fi

# ** Download page and resources
# Ignore non-zero exit codes from wget, which seem to be meaningless in
# recursive mode; just record them via error().
wget "${wget_options[@]}" "${rest[@]}" || error "wget exited with: $?"
# ** Link created HTML files
# Symlink each downloaded HTML file into the top level of the subdir, so
# pages are easy to find (wget buries them in per-domain directories).
if ! [[ $flat ]]
then
    # -print0 with "read -r -d ''" keeps filenames containing spaces or
    # newlines intact; IFS=/-r stop read from trimming whitespace and
    # interpreting backslashes.
    find "$subdir" -type f -iname "*.html" -print0 | while IFS= read -r -d '' file
    do
        # TODO: Allow setting filename (which means only one file can be linked, but usually that's what we want).
        filename=$(basename "$file")
        # Relative (-r) symlink named after the file itself; the previous
        # "$(unknown)" ran a nonexistent command and produced an empty
        # link name instead of ${filename}.
        ln -srv "$file" "${subdir}/${filename}"
    done
fi
# ** Compress to archive file
if [[ $archive ]]
then
    archive_name="$(pwd)/${subdir}.tar.${compression}"
    # Don't overwrite existing file
    [[ -e $archive_name ]] && die "File exists: $archive_name"
    # Make archive from inside subdir to avoid leading path name.
    cd "$subdir" || die "Couldn't change to $subdir"
    tar --create --auto-compress --file "$archive_name" ./ || error "Unable to make archive"
    # Check this cd too: if it failed, the cleanup below would run inside
    # $subdir and delete the wrong tree.
    cd .. || die "Couldn't return to parent of $subdir"
    # Remove subdir if it didn't exist before running this script.
    if [[ $subdir_existed ]]
    then
        echo "NOT deleting existing subdir: $subdir" >&2
    else
        # ${subdir:?} aborts instead of expanding empty, so this can never
        # degrade into "rm -rf" of the current directory.
        rm -rf -- "${subdir:?}"
    fi
fi
# Exit with the number of recorded errors; default to 0 when none were
# recorded (a bare "exit $errors" with $errors unset would otherwise
# reuse the previous command's status).
exit "${errors:-0}"
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
You can’t perform that action at this time.