#!/bin/bash
# Opensource.ORG quick&dirty mirror script.
# (C) 2007 Marcello Barnaba <vjt@openssl.it>
# Released under the terms of the DWTFYW License.
# Absolute path of the htdocs directory; every other path below is
# relative to it
MIRROR_BASE='/home/httpd/antifork.org/htdocs'
# Name of the directory (inside MIRROR_BASE) that holds the mirror
MIRROR_DIR='opensource.antifork.org'
# The site to mirror
MIRROR_URI='http://opensource.org'
# Directory holding the placeholder page that is shown while the
# script is running
WIP_DIR="${MIRROR_DIR}-updating"
# Scratch directory the download goes into; the random suffix makes
# the name differ from run to run
WORK_DIR="${MIRROR_DIR}-${RANDOM}"
# The wget(1) command line prefix. It is expanded unquoted wherever it
# is used, so the program path and its option split into two words.
WGET='/usr/bin/wget --quiet'
# Stop at the first command that fails .. yuck!
set -o errexit
# .................................................
# NO USER SERVICEABLE PARTS BELOW THE DOTTED LINE .
# .................................................
# Everything below uses paths relative to the htdocs directory. Bail
# out explicitly if it cannot be entered: the commands that follow
# delete things by relative path.
pushd "$MIRROR_BASE" > /dev/null || exit 1
# Create the scratch directory before touching the live mirror, so
# that a name collision (mkdir fails if the directory already exists)
# aborts the run while the current mirror is still being served.
mkdir -- "$WORK_DIR" || exit 1
# If the run dies before the scratch directory is put online, remove
# it instead of leaving a half-downloaded tree under htdocs. After a
# successful run the directory has been renamed, so this is a no-op.
# The path is absolute because the handler may fire after the working
# directory has changed again.
trap 'rm -rf -- "${MIRROR_BASE:?}/${WORK_DIR:?}"' EXIT
# wget madness downloads robots.txt continuously, so erase and
# re-download every time; for this kind of site, it's ok.
# The :? guard aborts instead of running a bare `rm -rf` should the
# variable ever be empty.
rm -rf -- "${MIRROR_DIR:?}"
# Serve the placeholder page under the mirror's name in the meantime
ln -s -- "$WIP_DIR" "$MIRROR_DIR"
# Fetch the pages and everything they need to render (images & co.)
# into the scratch directory, leaving out the /user and /event trees.
#
# NOTE(review): newer wget releases appear to exit non-zero when any
# single request of a recursive run fails; combined with `set -e` one
# dead link would then abort the whole update -- confirm that this is
# the intended behaviour.
#
mirror_args=(
  --directory-prefix="$WORK_DIR"
  --no-host-directories
  --domains=opensource.org
  --exclude-directories '/user,/event'
  --level=0
  --mirror
  --page-requisites
  --convert-links
  --html-extension
)
# $WGET is left unquoted on purpose: it carries the program path plus
# an option and has to split into separate words.
$WGET "${mirror_args[@]}" "$MIRROR_URI"
# Print the stylesheet paths that the HTML page given as $1 pulls in
# through `import "..."` statements, one per line. Lines that mention
# css but carry no such import are dropped (sed -n ... p) instead of
# being echoed back verbatim and mistaken for stylesheet names.
list_imported_stylesheets() {
  grep css "$1" | sed -n 's#.*import "\([a-z0-9/\.-]*\)".*#\1#p'
}

# Read CSS on stdin and print every distinct url(...) target, one per
# line. All occurrences on a line are reported, and the optional
# whitespace and quotes around the target are stripped.
list_css_urls() {
  grep -o 'url([^)]*)' \
    | sed -e 's#^url([[:space:]]*##' -e 's#[[:space:]]*)$##' \
      -e "s#^[\"']##" -e "s#[\"']\$##" \
    | sort -u
}

# Get all the stylesheets from the ToS page, one that will never
# disappear, hopefully
#
STYLESHEETS=$(list_imported_stylesheets "$WORK_DIR/ToS.html")
# Download all the images referenced in the CSS style sheets
#
# $STYLESHEETS is split on whitespace on purpose: the pattern above
# only lets through paths made of lowercase letters, digits and "/.-".
for stylesheet in $STYLESHEETS; do
  base=$(dirname "$stylesheet")
  mkdir -p "$WORK_DIR/$base"
  $WGET -O "$WORK_DIR/$stylesheet" "$MIRROR_URI/$stylesheet"
  while IFS= read -r image; do
    case "$image" in
      # Empty targets, data: URIs and absolute or protocol-relative
      # URLs cannot be fetched relative to the stylesheet; skip them.
      '' | *:* | //*) continue ;;
    esac
    # The image may live in a subdirectory that does not exist yet
    mkdir -p "$(dirname "$WORK_DIR/$base/$image")"
    $WGET -O "$WORK_DIR/$base/$image" "$MIRROR_URI/$base/$image"
  done < <(list_css_urls < "$WORK_DIR/$stylesheet")
done
# Remove the placeholder page. Without -r this only ever deletes a
# file or symlink; -f keeps it quiet if the entry is already gone.
rm -f -- "$MIRROR_DIR"
# Put the mirror online by renaming the scratch directory into place
mv -- "$WORK_DIR" "$MIRROR_DIR"
# Return to the directory the script was started from
popd > /dev/null
# EOF