Skip to content

Instantly share code, notes, and snippets.

@philcryer
Created April 10, 2012 15:56
Show Gist options
  • Save philcryer/2352375 to your computer and use it in GitHub Desktop.
Save philcryer/2352375 to your computer and use it in GitHub Desktop.
a bash script to perform batch downloads of Internet Archive (archive.org) materials, via record ids as listed in todo.txt
#!/bin/bash
#
################################################################################
#
# File : grabby.sh
# Usage : ./grabby.sh
# Author : phil.cryer@mobot.org
# Date created : 2009-10-10
# Last updated : 2012-04-10
# Source : http://code.google.com/p/bhl-bits/utilities/grabby
# Description : a bash script to perform batch downloads of Internet Archive
# (archive.org) materials, via record ids as listed in todo.txt
# Requires : Bash, wget
# (optional) : fast/stable internet connection, patience, sense of humor
#
################################################################################
#
# Copyright (c) 2012, Biodiversity Heritage Library
#
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer. Redistributions in binary
# form must reproduce the above copyright notice, this list of conditions and the
# following disclaimer in the documentation and/or other materials provided with
# the distribution. Neither the name of the Biodiversity Heritage Library nor
# the names of its contributors may be used to endorse or promote products
# derived from this software without specific prior written permission. THIS
# SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
################################################################################
# More information about the BSD License can be found here:
# http://www.opensource.org/licenses/bsd-license.php
################################################################################
#
########################################
# Check todo, set time, make directory
########################################
# Abort early when there is no work list: everything below reads todo.txt.
# Diagnostics go to stderr and the exit status is non-zero so that a caller
# (cron job, wrapper script) can tell that nothing was downloaded.
if [ ! -f "todo.txt" ]; then
  echo "can't find todo.txt, fail" >&2
  echo "define IA identifiers in todo.txt (one per line) and rerun" >&2
  exit 1
fi
clear

# Progress counter (sum) and its per-item increment (num); both are used by
# the download loop further down.
sum=0
num=1

# Human-readable start time for the report.
START_TIME=$(date "+%H:%M:%S %Y-%m-%d")
echo "Starting download at ${START_TIME}"
echo "------------------------------------------------------"

# The epoch start time doubles as a per-run id (PUID) so that each run gets
# its own output directory and manifest name.
START=$(date +%s)
PUID=${START}
COMPLETE_DIR="complete.${PUID}"
if ! mkdir -- "${COMPLETE_DIR}"; then
  # Without the output directory the finished downloads have nowhere to go.
  echo "can't create ${COMPLETE_DIR}, fail" >&2
  exit 1
fi
MANIFEST="00_manifest.${PUID}"
#
########################################
# Inventory do/done downloads
########################################
# Process todo.txt one Internet Archive identifier per line. For each one:
# fetch the item's directory listing, turn its links into a URL list, download
# everything on that list, then move the item directory into ${COMPLETE_DIR}.
#
# Number of identifiers to process; lines holding only spaces/tabs are not
# counted because the loop skips them. Computed once, it does not change.
TOTAL=$(grep -cv '^[[:blank:]]*$' todo.txt)

# 'read -r' keeps backslashes literal; default IFS still trims surrounding
# blanks. The '|| [ -n ... ]' clause processes a last line that has no
# trailing newline instead of silently dropping it.
while read -r BOOK_ID || [ -n "${BOOK_ID}" ]; do
  # An empty identifier must never reach the commands below: paths such as
  # "${BOOK_ID}/index.html" would collapse to "/index.html".
  if [ -z "${BOOK_ID}" ]; then
    continue
  fi

  BASE_URL="http://archive.org/download/${BOOK_ID}"
  sum=$((sum + num))

  # current.status.txt: line 1 is "<n> of <total>", line 2 is the current
  # title. The summary section reads the total back from line 1.
  printf '%s of %s\n' "${sum}" "${TOTAL}" > current.status.txt
  printf 'title: %s\n' "${BOOK_ID}" >> current.status.txt
  printf ' [ %s of %s ] Title: %s\n' "${sum}" "${TOTAL}" "${BOOK_ID}"

  if [ -d "${BOOK_ID}" ]; then
    echo " - Existing data found, continuing previous download..."
    # Drop a stale listing so that wget -nc fetches a fresh one.
    rm -f -- "${BOOK_ID}/index.html"
  fi
  #
  ########################################
  # Build download list
  ########################################
  wget -p -c -nc -nH -nd -erobots=off -P "${BOOK_ID}" "${BASE_URL}"
  # Keep the href value of every anchor line that mentions the identifier
  # (skipping the <h1> heading) and prefix it with the item's download URL.
  # The identifier is matched as a fixed string (-F), not as a regex.
  # NOTE(review): if the listing was not saved as index.html, grep reports the
  # missing file and the URL list is empty; the item is still moved below.
  # NOTE(review): the sed expression assumes the identifier contains no
  # '|', '&' or backslash characters.
  grep "<a href=" "${BOOK_ID}/index.html" \
    | grep -F -- "${BOOK_ID}" \
    | grep -v "<h1" \
    | cut -d">" -f1 \
    | cut -d'"' -f2 \
    | sed "s|^|${BASE_URL}/|" \
    > "${BOOK_ID}/download.urls"
  # The listing is no longer needed; also clear scratch files that an
  # interrupted earlier run may have left behind.
  rm -f -- "${BOOK_ID}/index.html" "${BOOK_ID}"/xml_files_tmp*
  #
  ########################################
  # Download files
  ########################################
  # download all related files (DEFAULT)
  wget -p -c -i "${BOOK_ID}/download.urls" -nc -nH -nd -erobots=off -P "${BOOK_ID}" "${BASE_URL}"
  # Notice: by default every file related to the record id is downloaded.
  # To limit this, filter the URL list before the wget line above. This
  # example keeps only the djvu.txt files:
  #grep djvu.txt "${BOOK_ID}/download.urls" > "${BOOK_ID}/download.urls-single"
  #mv "${BOOK_ID}/download.urls-single" "${BOOK_ID}/download.urls"
  # or replace the wget line above with this one to accept only xml files:
  #wget -p -c -A '.xml' -i "${BOOK_ID}/download.urls" -nc -nH -nd -erobots=off -P "${BOOK_ID}" "${BASE_URL}"
  #
  ########################################
  # Clean up download directory
  ########################################
  # The second wget fetches ${BASE_URL} again, so the listing may be back.
  rm -f -- "${BOOK_ID}/download.urls" "${BOOK_ID}/index.html"
  mv -- "${BOOK_ID}" "${COMPLETE_DIR}"
  echo "Download of ${BOOK_ID} complete."
done < todo.txt
#
########################################
# Summarize downloads, time, etc
########################################
# Write the run report to the manifest inside ${COMPLETE_DIR} and echo the
# same figures to the terminal.

# Measure only this run's output directory; the last line of 'du -c' is the
# grand total. Measuring the current directory would also count earlier
# complete.* runs and unrelated files.
TOTAL_DATA=$(du -hc "${COMPLETE_DIR}" | tail -n1)

# Line 1 of current.status.txt is "<n> of <total>"; field 3 is the total.
# The file does not exist when the download loop never processed an item.
if [ -f current.status.txt ]; then
  TOTAL_BOOKS=$(head -n1 current.status.txt | cut -d" " -f3)
  rm -f current.status.txt
else
  TOTAL_BOOKS=0
fi
END_TIME=$(date "+%H:%M:%S %Y-%m-%d")

rule="------------------------------------------------------"

# Manifest: header figures followed by one line per downloaded item.
{
  echo "${rule}"
  echo "Start time ${START_TIME}"
  echo "Finish time ${END_TIME} "
  echo "Data transfered ${TOTAL_DATA}"
  echo "Books transfered ${TOTAL_BOOKS}"
  echo "${rule}"
  # Glob instead of parsing 'ls' so names containing whitespace stay intact.
  for entry in "${COMPLETE_DIR}"/*; do
    if [ ! -e "${entry}" ]; then
      continue
    fi
    name=${entry##*/}
    # The manifest itself lives in the same directory; leave it out.
    if [ "${name}" = "${MANIFEST}" ]; then
      continue
    fi
    echo "${name}"
  done
} > "${COMPLETE_DIR}/${MANIFEST}"

# The same header figures for the terminal.
echo "${rule}"
echo "Start time ${START_TIME}"
echo "Finish time ${END_TIME} "
echo "Data transfered ${TOTAL_DATA}"
echo "Books transfered ${TOTAL_BOOKS}"
echo "${rule}"

# Elapsed wall-clock seconds since START was recorded at startup.
FINISH=$(date +%s)
ELAPSED=$((FINISH - START))
{
  echo "${rule}"
  echo "Total download time: ${ELAPSED} seconds"
} >> "${COMPLETE_DIR}/${MANIFEST}"
echo "Total download time: ${ELAPSED} seconds"
echo "Files downloaded to: ${COMPLETE_DIR}"
exit 0
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment