#!/bin/bash
# Opensource.ORG quick&dirty mirror script.
# (C) 2007 Marcello Barnaba
# Released under the terms of the DWTFYW License.
#
# What it does, in order:
#   1. Replaces the published mirror directory with a symlink to a
#      placeholder ("work in progress") directory.
#   2. Downloads the site with wget into a fresh working directory.
#   3. Downloads the stylesheets imported by ToS.html and the images
#      those stylesheets reference.
#   4. Removes the placeholder symlink and moves the working directory
#      into place as the new mirror.
#
# Any failing command aborts the run (set -e). In that case the working
# directory is removed and the placeholder symlink stays in place.

# The absolute base path of the htdocs directory
MIRROR_BASE="/home/httpd/antifork.org/htdocs"

# The directory under which the contents will be downloaded
MIRROR_DIR="opensource.antifork.org"

# The URI to mirror
MIRROR_URI="http://opensource.org"

# A directory containing a placeholder page to show while
# the script is running
WIP_DIR="${MIRROR_DIR}-updating"

# A temporary working directory
WORK_DIR="${MIRROR_DIR}-$RANDOM"

# The path to the wget(1) program, plus any default options
WGET="/usr/bin/wget --quiet"

# Abort on errors and on use of unset variables .. yuck!
set -eu

# .................................................
# NO USER SERVICEABLE PARTS BELOW THE DOTTED LINE .
# .................................................

# Split the configured wget command line into words exactly once, so it
# can be invoked without relying on unquoted expansion.
read -r -a wget_cmd <<< "$WGET"

# Remove the working directory. Uses an absolute path so that it is safe
# to run regardless of the current directory.
cleanup() {
  rm -rf -- "${MIRROR_BASE:?}/${WORK_DIR:?}"
}

pushd "$MIRROR_BASE" > /dev/null

# wget madness downloads robots.txt continuously,
# so erase and re-download every time. for this kind
# of site, it's ok.
#
rm -rf -- "${MIRROR_DIR:?}"
ln -s -- "$WIP_DIR" "$MIRROR_DIR"

# Plain mkdir (not -p): fail instead of reusing a directory that
# already exists under the same random name.
mkdir -- "$WORK_DIR"

# Only arm the cleanup once the directory is known to be ours.
trap cleanup EXIT

# Download the HTML and IMG stuff ..
#
"${wget_cmd[@]}" --domains=opensource.org --convert-links --level=0 \
  --mirror --page-requisites --no-host-directories \
  "$MIRROR_URI" --exclude-directories '/user,/event' \
  --directory-prefix="$WORK_DIR" --html-extension

# Get all the stylesheets from the ToS page, one that will never
# disappear, hopefully. Only lines that actually contain an
# import "..." statement are printed (-n ... p); other lines mentioning
# "css" are dropped instead of being passed through verbatim.
#
STYLESHEETS=$(grep css "$WORK_DIR/ToS.html" |
  sed -n 's#.*import "\([a-z0-9/\.-]*\)".*#\1#p')

# Download all the images referenced in the CSS style sheets
#
while IFS= read -r stylesheet; do
  # The here-string yields one empty line when no stylesheet was found.
  [[ -n "$stylesheet" ]] || continue

  base=$(dirname "$stylesheet")
  mkdir -p -- "$WORK_DIR/$base"

  "${wget_cmd[@]}" -O "$WORK_DIR/$stylesheet" "$MIRROR_URI/$stylesheet"

  # Extract every url(...) occurrence (several may share one line),
  # then strip the url( ) wrapper, quotes and blanks.
  grep -o 'url([^)]*)' "$WORK_DIR/$stylesheet" |
    sed -e 's#^url(##' -e 's#)$##' -e "s#[\"' ]##g" |
    sort -u |
    while IFS= read -r image; do
      case "$image" in
        '' | data:* | *://*)
          # Nothing to fetch, inline data, or a resource on another URL.
          continue
          ;;
        /*)
          # Root-relative reference: resolve against the site root.
          target="$WORK_DIR$image"
          source_uri="$MIRROR_URI$image"
          ;;
        *)
          # Relative reference: resolve against the stylesheet directory.
          target="$WORK_DIR/$base/$image"
          source_uri="$MIRROR_URI/$base/$image"
          ;;
      esac

      # wget -O does not create missing parent directories.
      mkdir -p -- "$(dirname "$target")"
      "${wget_cmd[@]}" -O "$target" "$source_uri"
    done
done <<< "$STYLESHEETS"

# Remove the placeholder page
rm -f -- "$MIRROR_DIR"

# Put the mirror online
mv -- "$WORK_DIR" "$MIRROR_DIR"

# The working directory has been published; nothing left to clean up.
trap - EXIT

popd > /dev/null

# EOF