Download from archive.org Wayback Machine
#!/bin/bash
url=http://redefininggod.com
webarchive=https://web.archive.org
wget="wget -e robots=off -nv"
tab="$(printf '\t')"
additional_url=url.list
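# Usage (as implied by the loops below): put the URLs to mirror, one per
# line, in a file named url.list next to this script, then run it with
# bash (arrays and [[ ]] are bashisms, so "sh download.sh" will fail):
#   bash ./download.sh
# url.list may mix plain page URLs, wildcard URLs ending in "/*", and
# full https://web.archive.org/web/... capture URLs.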
# Construct listing.txt from url.list.
# listing.txt is the list of archived pages, possibly including wildcard URLs.
# Each line holds four tab-separated fields:
# - the last capture date (opaque format; if it changed since the previous
#   run, the index file for the last year is redownloaded)
# - the first capture year (a hint for the oldest index to query)
# - the last capture year (a hint for the latest index to query)
# - the URL, starting with "/web/YYYYMMDDHHMMSS/" or "/web/*/" (only)
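# An illustrative example line (<TAB> stands for a real tab character):
#   Mon Jan  5 10:00:00 UTC 2015<TAB>1996<TAB>2014<TAB>/web/*/http://redefininggod.com/*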
: >listing.txt
# Add url.list to listing.txt
while read -r url; do
  if [[ -z "$url" ]]; then continue; fi
  if [[ $url != ${url#http*/web.archive.org} ]]; then
    # Already a Wayback Machine URL: keep only the /web/... part
    url="${url#http*/web.archive.org}"
  elif [[ $url != ${url%/\*} ]]; then
    # Wildcard URL: fetch the search results page and scrape the
    # <table id="resultsUrl"> rows. For each row the sed script grabs the
    # href, the dateFrom year and the dateTo year, and emits them as one
    # tab-separated listing.txt line.
    mkdir -p "$(dirname "./web/*/$url")"
    $wget "$webarchive/web/*/$url" -O "./web/*/$url.html"
    # <listing.html fgrep 'href="/web/' | cut -d'"' -f2 >listing.txt
    <"./web/*/$url.html" sed -r -e '
      /<table id="resultsUrl">/,/<\/table>/ {
        /a href/ {
          s/.*href="(.*)".*/\1/;
          h
        };
        /dateFrom/ {
          s/.*([0-9]{4})<\/td>.*/\1/;
          x;
          H
        };
        /dateTo/ {
          s/.*>(.*)([0-9]{4})<\/td>.*/\1\2\n\2/;
          x;
          H
        };
        /<\/tr>/ {
          x;
          s/(.*)\n(.*)\n(.*)\n(.*)/\1\t\3\t\2\t\4/;
          p
        }
      };
      d' >"./web/*/$url.txt"
    cat "./web/*/$url.txt" >>listing.txt
    continue
  else
    # Plain URL: prefix it with /web/*/ so the next pass expands all captures
    url="/web/*/$url"
  fi
  # No capture info yet: use the current date and a wide 1996-2014 year
  # range so that every yearly index gets queried at least once
  printf "%s\t%s\t%s\t%s\n" "$(date)" 1996 2014 "$url" >>listing.txt
done <"$additional_url"
# Construct listing2.txt.
# Expand each wildcard URL by fetching all capture versions from the
# yearly indexes. Each resulting line contains only a URL starting with
# "/web/YYYYMMDDHHMMSS/". The file may contain duplicates.
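# An illustrative example line (hypothetical timestamp):
#   /web/20140105100000/http://redefininggod.com/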
: >listing2.txt
while IFS="$tab" read -r lastcap firstyear lastyear mainurl; do
  if [[ -z "$lastcap" ]]; then continue; fi
  #echo "Main URL: $firstyear->$lastyear $mainurl"
  if [[ $mainurl == '/web/*/'* ]]; then
    listing="./$mainurl.txt"
    mkdir -p "$(dirname "$listing")"
    : >"$listing"
    # Compare against the state saved by the previous run to decide
    # whether a yearly index needs refreshing
    oldlastcap="$(cat "./$mainurl.lastcap.txt" 2>/dev/null)"
    oldlastyear="$(cat "./$mainurl.lastyear.txt" 2>/dev/null)"
    : "${oldlastyear:=$lastyear}"
    for y in $(seq "$firstyear" "$lastyear"); do
      u="/web/${y}0101000000*/${mainurl#/web/*/}"
      mkdir -p "$(dirname "./$u.html")"
      # Download the yearly index only if missing, or if it covers a year
      # that may have gained new captures since the last run
      if ! [[ -s "./$u.html" ]] || { [[ $y -ge $oldlastyear ]] && [[ $lastcap != "$oldlastcap" ]]; }; then
        $wget "$webarchive$u" -O "./$u.html"
      fi
      #<"./$u.html" egrep 'href="/web/[0-9]+\*' | sed -r 's/.*href="([^"]*)".*/\1/' >"$d/$f.txt"
      <"./$u.html" egrep 'href="/web/[0-9]*/' | sed -r 's/.*href="([^"]*)".*/\1/' >>"$listing"
    done
    printf %s "$lastcap" >"./$mainurl.lastcap.txt"
    printf %s "$lastyear" >"./$mainurl.lastyear.txt"
    sort "$listing" | uniq >>listing2.txt
  else
    echo "$mainurl" >>listing2.txt
  fi
done <listing.txt
# Construct listing3.txt.
# Sort, deduplicate, and append "id_" to the timestamp so the Wayback
# Machine serves the unmodified original page.
# Every URL starts with "/web/YYYYMMDDHHMMSSid_/" (only).
# This is the list of files to download (if not already present).
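# An illustrative rewrite (hypothetical timestamp; "id_" is the Wayback
# Machine modifier that returns the raw capture without rewriting):
#   /web/20140105100000/http://redefininggod.com/
#   -> /web/20140105100000id_/http://redefininggod.com/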
<listing2.txt sort | uniq | sed -r 's:^/web/([0-9]*)/:/web/\1id_/:' >listing3.txt
# Download listing3
while read -r url; do
  # URLs ending in "/" are directories: store their page as an .index file
  if [[ $url != ${url%/} ]]; then
    f="./$url/.index"
  else
    f="./$url"
  fi
  mkdir -p "$(dirname "$f")"
  if ! [[ -s "$f" ]]; then
    $wget "$webarchive$url" -O "$f"
  fi
done <listing3.txt
@rudolphos commented May 11, 2015

C:\Users\COMPUTER\Desktop\566\download.sh: syntax error near unexpected token `elems=($'
C:\Users\COMPUTER\Desktop\566\download.sh: line 77: `IFS="$tab" elems=($line)'
bash$ cd
bash$

@hartator commented Aug 10, 2015

Didn't manage to get it to work, so I ended up coding a small gem in Ruby: https://github.com/hartator/wayback-machine-downloader

@isuvorov commented Aug 12, 2015

./download.sh: line 62: url.list: No such file or directory

@insaner commented Nov 1, 2016

For those having problems like "./download.sh: line 62: url.list: No such file or directory" go to lines 21 and 62 and comment them out by placing a # character in front of them. Make sure you edit line 3 to point to your domain before you run the command.

However, I think this only manages to pull the indexes (calendars), not the actual archived pages.
