Created
May 13, 2012 19:05
-
-
Save bear/2689781 to your computer and use it in GitHub Desktop.
Simple seesaw-like script for ArchiveTeam's FilePlanet project.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
#
# Distributed downloading script for fileplanet
# (shamelessly copied from the Mobile-Me project)
#
# Usage:
#   ./seesaw_fp.sh $YOURNICK $start $end
#
# To stop the script gracefully, touch STOP in the script's
# working directory. The script will then finish the current
# download and stop.
#
youralias="$1"   # NOTE(review): recorded but never used below — presumably for upload credit; confirm
startrange=$2
targetrange=$3
basedir=$(pwd)

while [ ! -f STOP ]; do
  # Work in chunks of 100 IDs: [startrange, startrange+99].
  endrange=$((startrange + 99))
  echo "working on range $startrange to $endrange"
  mkdir -p "$basedir/$startrange-$endrange"
  # Abort if we cannot enter the chunk directory — wget would otherwise
  # scatter files into whatever directory we happen to be in.
  cd "$basedir/$startrange-$endrange/" || exit 1
  for i in $(seq "$startrange" "$endrange"); do
    echo "Trying to download $i"
    # fileplanet returns a "302 Found" for non-existing IDs
    # redirecting to "Location: /error/error.shtml?aspxerrorpath=/autodownload.aspx
    # we don't want those files, so "--max-redirect=0"
    # BUG FIX: both variables need braces — $startrange_ would be parsed as a
    # single (unset) variable name, yielding a wrong log filename.
    wget -nv -a "pages_${startrange}_${endrange}.log" --force-directories --max-redirect=0 "http://www.fileplanet.com/${i}/download/"
    # extract the session download link to the actual file we want
    # the URL is enclosed by single quotes. The second grep will get everything from http until the last '. The rev/cut will remove the trailing '.
    linktowget=$(grep default-file-download-link "www.fileplanet.com/${i}/download/index.html" 2>/dev/null | grep -Eo "http.*'" | rev | cut -c 2- | rev)
    if [ -z "${linktowget}" ]; then
      echo "No download link found."
    else
      echo "Download link found, downloading ${linktowget}"
      # download the file to the same directory as its download page HTML
      # (same brace fix as the pages log above)
      wget -nv -a "files_${startrange}_${endrange}.log" --directory-prefix="www.fileplanet.com/${i}/download/" "${linktowget}"
    fi
    echo "-----"
  done
  echo "Downloading finished! Yay!"
  echo -n "Counting files: "
  ls -1 www.fileplanet.com/ | wc -l
  echo -n "Getting the size: "
  du -hs www.fileplanet.com/
  echo "TARring!"
  cd "$basedir" || exit 1
  tar -cf "$startrange-$endrange.tar" "$startrange-$endrange/"
  # This is not recommended to do automatically. The chunk might have been tiny or huge. Better check first.
  #echo "Uploading to archive.org!"
  #s3cmd --add-header x-archive-auto-make-bucket:1 --add-header "x-archive-meta-description:Files from Fileplanet (www.fileplanet.com), all files from the ID range $startrange to $endrange." put $startrange-$endrange.tar s3://FileplanetFiles_$startrange-$endrange
  #s3cmd put $startrange-$endrange/*.log s3://FileplanetFiles_$startrange-$endrange/
  echo "Done. YAAAY!"
  startrange=$((startrange + 100))
  # BUG FIX: '>=' is not a valid operator inside [[ ]] (syntax error that
  # killed the script after the first chunk); use arithmetic evaluation,
  # which also compares numerically rather than lexicographically.
  if (( startrange >= targetrange )); then
    break
  fi
done
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment