Created
May 13, 2012 19:05
-
-
Save bear/2689781 to your computer and use it in GitHub Desktop.
Simple seesaw-like script for ArchiveTeam's FilePlanet project.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
#
# Distributed downloading script for fileplanet
# (shamelessly copied from the Mobile-Me project)
#
# Usage:
#   ./seesaw_fp.sh $YOURNICK $start $end
#
# To stop the script gracefully, touch STOP in the script's
# working directory. The script will then finish the current
# download and stop.
#
youralias="$1"   # NOTE(review): recorded but never used below — presumably for upload credit; confirm
startrange=$2
targetrange=$3
basedir=$(pwd)

while [ ! -f STOP ]; do
  # Work in chunks of 100 IDs: [startrange, startrange+99].
  endrange=$((startrange + 99))
  echo "working on range $startrange to $endrange"
  mkdir -p "$basedir/$startrange-$endrange"
  # Abort if we cannot enter the chunk directory — wget would otherwise
  # scatter files into whatever directory we happen to be in.
  cd "$basedir/$startrange-$endrange/" || exit 1
  for i in $(seq "$startrange" "$endrange"); do
    echo "Trying to download $i"
    # fileplanet returns a "302 Found" for non-existing IDs
    # redirecting to "Location: /error/error.shtml?aspxerrorpath=/autodownload.aspx
    # we don't want those files, so "--max-redirect=0"
    # BUG FIX: both variables need braces — $startrange_ would be parsed as a
    # single (unset) variable name, yielding a wrong log filename.
    wget -nv -a "pages_${startrange}_${endrange}.log" --force-directories --max-redirect=0 "http://www.fileplanet.com/${i}/download/"
    # extract the session download link to the actual file we want
    # the URL is enclosed by single quotes. The second grep will get everything from http until the last '. The rev/cut will remove the trailing '.
    linktowget=$(grep default-file-download-link "www.fileplanet.com/${i}/download/index.html" 2>/dev/null | grep -Eo "http.*'" | rev | cut -c 2- | rev)
    if [ -z "${linktowget}" ]; then
      echo "No download link found."
    else
      echo "Download link found, downloading ${linktowget}"
      # download the file to the same directory as its download page HTML
      # (same brace fix as the pages log above)
      wget -nv -a "files_${startrange}_${endrange}.log" --directory-prefix="www.fileplanet.com/${i}/download/" "${linktowget}"
    fi
    echo "-----"
  done
  echo "Downloading finished! Yay!"
  echo -n "Counting files: "
  ls -1 www.fileplanet.com/ | wc -l
  echo -n "Getting the size: "
  du -hs www.fileplanet.com/
  echo "TARring!"
  cd "$basedir" || exit 1
  tar -cf "$startrange-$endrange.tar" "$startrange-$endrange/"
  # This is not recommended to do automatically. The chunk might have been tiny or huge. Better check first.
  #echo "Uploading to archive.org!"
  #s3cmd --add-header x-archive-auto-make-bucket:1 --add-header "x-archive-meta-description:Files from Fileplanet (www.fileplanet.com), all files from the ID range $startrange to $endrange." put $startrange-$endrange.tar s3://FileplanetFiles_$startrange-$endrange
  #s3cmd put $startrange-$endrange/*.log s3://FileplanetFiles_$startrange-$endrange/
  echo "Done. YAAAY!"
  startrange=$((startrange + 100))
  # BUG FIX: '>=' is not a valid operator inside [[ ]] (syntax error that
  # killed the script after the first chunk); use arithmetic evaluation,
  # which also compares numerically rather than lexicographically.
  if (( startrange >= targetrange )); then
    break
  fi
done
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment