Created October 20, 2014 10:03
Download from archive.org Wayback Machine
#!/bin/bash
url=http://redefininggod.com
webarchive=https://web.archive.org
wget="wget -e robots=off -nv"
tab="$(printf '\t')"
additional_url=url.list
# Construct listing.txt from url.list
# The list of archived pages, including some wildcard urls
# Each line contains the following fields, separated by tabs:
# - the last capture date (opaque format; if it differs, the last year's index file
#   will be redownloaded)
# - the first capture year (hint for which is the oldest index to query)
# - the last capture year (hint for which is the latest index to query)
# - the url, starting with "/web/YYYYMMDDHHMMSS/" or "/web/*" (only)
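# Example line (illustrative values; the field separators are literal tabs):
#   Mon Oct 20 10:03:00 UTC 2014 <TAB> 1996 <TAB> 2014 <TAB> /web/*/http://redefininggod.com/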
: >listing.txt
# Add url.list to listing.txt
while read url; do
  if [[ -z "$url" ]]; then continue; fi
  if [[ $url != ${url#http*/web.archive.org} ]]; then
    url="${url#http*/web.archive.org}"
  elif [[ $url != ${url%/\*} ]]; then
    mkdir -p "$(dirname "./web/*/$url")"
    $wget "$webarchive/web/*/$url" -O "./web/*/$url.html"
    # <listing.html fgrep 'href="/web/' | cut -d'"' -f2 >listing.txt
    <"./web/*/$url.html" sed -r -e '
      /<table id="resultsUrl">/,/<\/table>/ {
        /a href/ {
          s/.*href="(.*)".*/\1/;
          h
        };
        /dateFrom/ {
          s/.*([0-9]{4})<\/td>.*/\1/;
          x;
          H
        };
        /dateTo/ {
          s/.*>(.*)([0-9]{4})<\/td>.*/\1\2\n\2/;
          x;
          H
        };
        /<\/tr>/ {
          x;
          s/(.*)\n(.*)\n(.*)\n(.*)/\1\t\3\t\2\t\4/;
          p
        }
      };
      d' >"./web/*/$url.txt"
    cat "./web/*/$url.txt" >>listing.txt
    continue
  else
    url="/web/*/$url"
  fi
  printf "%s\t%s\t%s\t%s\n" "$(date)" 1996 2014 "$url" >>listing.txt
done <"$additional_url"
# Construct listing2.txt
# Remove the wildcard urls and fetch all the versions from the index
# Lines contain only the URL, starting with "/web/YYYYMMDDHHMMSS/" (only)
# It may contain duplicates
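# Example entry (illustrative):
#   /web/20141020100300/http://redefininggod.com/some-post/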
: >listing2.txt
while read line; do
  if [[ -z "$line" ]]; then continue; fi
  #printf "%s\n" "$line"
  oldifs="$IFS"
  IFS="$tab" elems=($line)
  IFS="$oldifs"
  lastcap="${elems[0]}"
  firstyear="${elems[1]}"
  lastyear="${elems[2]}"
  mainurl="${elems[3]}"
  #echo "Main URL: $firstyear->$lastyear $mainurl"
  if [[ $mainurl =~ '/web/*/' ]]; then
    listing="./$mainurl.txt"
    mkdir -p "$(dirname "$listing")"
    : >"$listing"
    oldlastcap="$(cat "./$mainurl.lastcap.txt" 2>/dev/null)"
    oldlastyear="$(cat "./$mainurl.lastyear.txt" 2>/dev/null)"
    : ${oldlastyear:=$lastyear}
    for y in $(seq $firstyear $lastyear); do
      u="/web/${y}0101000000*/${mainurl#/web/*/}"
      mkdir -p "$(dirname "./$u.html")"
      if ! [[ -s "./$u.html" ]] || ([[ $y -ge $oldlastyear ]] && [[ $lastcap != $oldlastcap ]]) ; then
        $wget "$webarchive$u" -O "./$u.html"
      fi
      #<"./$u.html" egrep 'href="/web/[0-9]+\*' | sed -r 's/.*href="([^"]*)".*/\1/' >"$d/$f.txt"
      <"./$u.html" egrep 'href="/web/[0-9]*/' | sed -r 's/.*href="([^"]*)".*/\1/' >>"$listing"
    done
    printf %s "$lastcap" >"./$mainurl.lastcap.txt"
    printf %s "$lastyear" >"./$mainurl.lastyear.txt"
<"$listing" | sort | uniq >>listing2.txt | |
  else
    echo "$mainurl" >>listing2.txt
  fi
done <listing.txt
# Construct listing3.txt
# sort, uniq, and use the unmodified page by appending id_ to the timestamp
# URLs must start with "/web/YYYYMMDDHHMMSSid_/" only.
# This is the list of files that need to be downloaded (if not already present)
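# Example entry (illustrative): /web/20141020100300id_/http://redefininggod.com/some-post/
# The id_ modifier asks the Wayback Machine for the raw capture, without the archive toolbar or link rewriting.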
<listing2.txt sort | uniq | sed -r 's:^/web/([0-9]*)/:/web/\1id_/:' >listing3.txt
# Download listing3
while read url; do
  if [[ $url != ${url%/} ]]; then
    f="./$url/.index"
  else
    f="./$url"
  fi
  mkdir -p "$(dirname "$f")"
  if ! [[ -s "$f" ]]; then
    $wget "$webarchive$url" -O "./$f"
  fi
done <listing3.txt
For those having problems like "./download.sh: line 62: url.list: No such file or directory", go to lines 21 and 62 and comment them out by placing a # character in front of them. Make sure you edit line 3 to point to your domain before you run the script.
However, I think this only manages to pull the indexes (calendars), not the actual archived pages.
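Another way around the url.list error, instead of commenting lines out, is to create the file the script actually reads its input from. A minimal sketch (the filename comes from the additional_url= variable near the top of the script, and each line is either a page URL or a prefix ending in /* for a wildcard listing; the URL shown is just the one from the script):
printf '%s\n' 'http://redefininggod.com/*' > url.list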
you can do all this in just a few lines by using the api:
#!/bin/bash
url= #your url
wget "http://web.archive.org/cdx/search/cdx?url=${url}*&output=json&fl=original,timestamp" -O out.json
sed -Eni '2,$s%^\["([^"]*)","([^"]*)"](,|])$%wget "https://web.archive.org/web/\2id_/\1" -O \2.html%gmp' out.json
cat out.json | sh
if you don't care about the filenames being dumb, wget will reuse the same connection for all the urls in a list, making it faster:
#!/bin/bash
url= #your url
wget "http://web.archive.org/cdx/search/cdx?url=${url}*&output=json&fl=original,timestamp" -O out.json
sed -Eni '2,$s%^\["([^"]*)","([^"]*)"](,|])$%https://web.archive.org/web/\2id_/\1%gmp' out.json
wget -i out.json
you can even filter the results and such; see the CDX API docs
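For example, something like this (a sketch; filter, collapse, from and to are standard CDX server parameters, and url is still your placeholder) keeps only successful text/html captures, drops byte-identical duplicates, and restricts the range to 2010 through 2014:
#!/bin/bash
url= #your url
# same query as above, but filtered and collapsed on the server side
wget "http://web.archive.org/cdx/search/cdx?url=${url}*&output=json&fl=original,timestamp&filter=statuscode:200&filter=mimetype:text/html&collapse=digest&from=2010&to=2014" -O out.json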