Skip to content

Instantly share code, notes, and snippets.

@RouxRC
Created June 30, 2015 07:28
Show Gist options
  • Save RouxRC/3392c1ea43e9da3dcaaa to your computer and use it in GitHub Desktop.
Save RouxRC/3392c1ea43e9da3dcaaa to your computer and use it in GitHub Desktop.
Backup anima-vet.fr
#!/bin/bash
# Backup script for anima-vet.fr: downloads the site's pages and the files
# they reference into the current directory.

# Site root (host name plus trailing slash). It is prepended to relative
# links and used to recognise links that already point at the site.
rurl='anima-vet.fr/'
# Print the lines of a possibly binary file that match a grep pattern.
#
# NUL bytes are stripped before matching. grep is forced into text mode (-a)
# so that remaining non-text bytes (for instance Latin-1 characters read in a
# UTF-8 locale) neither suppress matching lines nor add a
# "Binary file ... matches" line to the output.
#
# Arguments:
#   $1 - grep pattern (basic regular expression); may start with "-".
#   $2 - path of the file to search.
# Outputs:
#   Matching lines on stdout.
# Returns:
#   The exit status of grep (0 on match, 1 when nothing matched).
function bingrep {
  tr -d '\000' < "$2" | grep -a -- "$1"
}
# Download one URL of the site and, for HTML pages, recurse into its links.
#
# Globals:
#   rurl - site root ("host/"); prepended to URLs that do not contain it.
# Arguments:
#   $1 - URL or site-relative path to fetch.
#   $2 - optional local file name; defaults to the part of the URL that
#        follows ".fr/".
# Outputs:
#   Progress lines on stdout, warnings on stderr.
# Returns:
#   0 when the file already existed or was fetched; 1 when no local file
#   name can be derived or wget fails.
function handle_page {
  local url=$1 page dir link

  # Links that do not mention the site root are treated as relative to it.
  # The root is compared literally, not as a regular expression.
  if [[ $url != *"$rurl"* ]]; then
    url="$rurl$1"
  fi

  if [[ -z ${2:-} ]]; then
    page=$(printf '%s\n' "$url" | sed 's/^.*\.fr\///')
  else
    page=$2
  fi

  # A bare site root without an explicit name yields an empty file name.
  if [[ -z $page ]]; then
    printf 'Skipping %s: no local file name can be derived\n' "$url" >&2
    return 1
  fi

  # An existing file means this target was already handled; returning here
  # is also what ends the recursion when pages link to each other.
  if [[ -e $page ]]; then
    return
  fi

  # wget -O does not create missing parent directories.
  if [[ $page == */* && -n ${page%/*} ]]; then
    mkdir -p -- "${page%/*}"
  fi

  if ! wget --quiet "$url" -O "$page"; then
    printf 'Failed to download %s\n' "$url" >&2
    return 1
  fi

  if [[ $page != *.htm* ]]; then
    echo ". $page"
    return
  fi

  echo "Downloading $url to $page..."

  # Companion directory "<name>_fichiers" that holds the page's resources.
  dir="${page%%.htm*}_fichiers"
  mkdir -p "$dir"

  # 1. Resources stored under the companion directory. The greedy ".*"
  #    keeps only the last href/src/target attribute found on each line.
  bingrep "$dir/" "$page" |
    sed 's/.*\(href\|src\|target\)="//' |
    sed 's/".*$//' |
    while read -r link; do
      [[ -n $link ]] || continue
      handle_page "$link"
    done

  # 2. Unquoted relative links of the form href=name.
  bingrep 'href=[a-z]' "$page" |
    sed 's/^.*href=//' |
    sed 's/>.*$//' |
    while read -r link; do
      [[ -n $link ]] || continue
      handle_page "$link"
    done

  # 3. Quoted absolute links to the site, with or without the "www." prefix,
  #    which is dropped so both spellings map to the same local file.
  bingrep "href=\"http://\(www\.\)\?$rurl[a-z]" "$page" |
    sed 's/^.*href="//' |
    sed 's/".*$//' |
    sed 's/\/www\./\//' |
    sort -u |
    while read -r link; do
      [[ -n $link ]] || continue
      handle_page "$link"
    done
}
# Entry point: fetch the site root, store it as index.html and follow its links.
handle_page "${rurl}" 'index.html'
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment