#!/bin/bash
url=http://redefininggod.com
webarchive=https://web.archive.org
wget="wget -e robots=off -nv"
tab="$(printf '\t')"
additional_url=url.list
# Construct listing.txt from url.list
# The list of archived pages, including some wildcard URLs.
# Each line contains four fields separated by tabs:
# - the last capture date (opaque format; if it differs from the stored value,
#   the latest year index file will be redownloaded)
# - the first capture year (a hint for the oldest index to query)
# - the last capture year (a hint for the latest index to query)
# - the URL, starting with "/web/YYYYMMDDHHMMSS/" or "/web/*" (only)
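# Example listing.txt line (illustrative values; <TAB> marks the tab separators,
# the first field is whatever "date" printed when the entry was generated):
#   Sat Jan  4 12:00:00 UTC 2014<TAB>1996<TAB>2014<TAB>/web/*/http://redefininggod.com/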
: >listing.txt
# Add url.list to listing.txt
while read url; do
  if [[ -z "$url" ]]; then continue; fi
  if [[ $url != ${url#http*/web.archive.org} ]]; then
    url="${url#http*/web.archive.org}"
  elif [[ $url != ${url%/\*} ]]; then
    mkdir -p "$(dirname "./web/*/$url")"
    $wget "$webarchive/web/*/$url" -O "./web/*/$url.html"
    # <listing.html fgrep 'href="/web/' | cut -d'"' -f2 >listing.txt
    <"./web/*/$url.html" sed -r -e '
      /<table id="resultsUrl">/,/<\/table>/ {
        /a href/ {
          s/.*href="(.*)".*/\1/;
          h
        };
        /dateFrom/ {
          s/.*([0-9]{4})<\/td>.*/\1/;
          x;
          H
        };
        /dateTo/ {
          s/.*>(.*)([0-9]{4})<\/td>.*/\1\2\n\2/;
          x;
          H
        };
        /<\/tr>/ {
          x;
          s/(.*)\n(.*)\n(.*)\n(.*)/\1\t\3\t\2\t\4/;
          p
        }
      };
      d' >"./web/*/$url.txt"
    cat "./web/*/$url.txt" >>listing.txt
    continue
  else
    url="/web/*/$url"
  fi
  printf "%s\t%s\t%s\t%s\n" "$(date)" 1996 2014 "$url" >>listing.txt
done <"$additional_url"
# Construct listing2.txt
# Remove the wildcard URLs and fetch all the versions from the index.
# Each line contains only a URL starting with "/web/YYYYMMDDHHMMSS/".
# It may contain duplicates.
: >listing2.txt
while read line; do
  if [[ -z "$line" ]]; then continue; fi
  #printf "%s\n" "$line"
  oldifs="$IFS"
  IFS="$tab" elems=($line)
  IFS="$oldifs"
  lastcap="${elems[0]}"
  firstyear="${elems[1]}"
  lastyear="${elems[2]}"
  mainurl="${elems[3]}"
  #echo "Main URL: $firstyear->$lastyear $mainurl"
  if [[ $mainurl =~ '/web/*/' ]]; then
    listing="./$mainurl.txt"
    mkdir -p "$(dirname "$listing")"
    : >"$listing"
    oldlastcap="$(cat "./$mainurl.lastcap.txt" 2>/dev/null)"
    oldlastyear="$(cat "./$mainurl.lastyear.txt" 2>/dev/null)"
    : ${oldlastyear:=$lastyear}
    for y in $(seq $firstyear $lastyear); do
      u="/web/${y}0101000000*/${mainurl#/web/*/}"
      mkdir -p "$(dirname "./$u.html")"
      if ! [[ -s "./$u.html" ]] || ([[ $y -ge $oldlastyear ]] && [[ $lastcap != $oldlastcap ]]); then
        $wget "$webarchive$u" -O "./$u.html"
      fi
      #<"./$u.html" egrep 'href="/web/[0-9]+\*' | sed -r 's/.*href="([^"]*)".*/\1/' >"$d/$f.txt"
      <"./$u.html" egrep 'href="/web/[0-9]*/' | sed -r 's/.*href="([^"]*)".*/\1/' >>"$listing"
    done
    printf %s "$lastcap" >"./$mainurl.lastcap.txt"
    printf %s "$lastyear" >"./$mainurl.lastyear.txt"
    sort "$listing" | uniq >>listing2.txt
  else
    echo "$mainurl" >>listing2.txt
  fi
done <listing.txt
# Construct listing3.txt
# Sort, remove duplicates, and request the unmodified page by appending id_ to the timestamp.
# URLs must start with "/web/YYYYMMDDHHMMSSid_/" only.
# This is the list of files that need to be downloaded (if not already present).
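# Example of the rewrite (illustrative URL):
#   /web/20140104120000/http://redefininggod.com/page.html
#   becomes /web/20140104120000id_/http://redefininggod.com/page.html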
<listing2.txt sort | uniq | sed -r 's:^/web/([0-9]*)/:/web/\1id_/:' >listing3.txt
# Download listing3
while read url; do
  if [[ $url != ${url%/} ]]; then
    f="./$url/.index"
  else
    f="./$url"
  fi
  mkdir -p "$(dirname "$f")"
  if ! [[ -s "$f" ]]; then
    $wget "$webarchive$url" -O "./$f"
  fi
done <listing3.txt
Didn't manage to get it to work; ended up coding a small gem in Ruby: https://github.com/hartator/wayback-machine-downloader
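If that gem works for you, basic usage is roughly this (a sketch from memory of its README; check the repo for the exact options):
gem install wayback_machine_downloader
wayback_machine_downloader http://redefininggod.com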
./download.sh: line 62: url.list: No such file or directory
For those having problems like "./download.sh: line 62: url.list: No such file or directory", go to lines 21 and 62 and comment them out by placing a # character in front of them. Make sure you edit line 3 to point to your domain before you run the command.
However, I think this only manages to pull the indexes (calendars), not the actual archived pages.
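Another option (an untested sketch, based on the script reading its input from $additional_url): instead of commenting those lines out, create the url.list file the script expects, one URL per line:
printf '%s\n' 'http://redefininggod.com/' > url.list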
You can do all this in just a few lines by using the API:
#!/bin/bash
url= #your url
wget "http://web.archive.org/cdx/search/cdx?url=${url}*&output=json&fl=original,timestamp" -O out.json
sed -Eni '2,$s%^\["([^"]*)","([^"]*)"](,|])$%wget "https://web.archive.org/web/\2id_/\1" -O \2.html%gmp' out.json
cat out.json | sh
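For reference (illustrative values): with fl=original,timestamp each data row of out.json looks like ["http://redefininggod.com/","20140104120000"], and the sed rewrites it into a command such as:
wget "https://web.archive.org/web/20140104120000id_/http://redefininggod.com/" -O 20140104120000.html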
If you don't care about the filenames being dumb, wget will reuse the same connection for all the URLs in a list, making it faster:
#!/bin/bash
url= #your url
wget "http://web.archive.org/cdx/search/cdx?url=${url}*&output=json&fl=original,timestamp" -O out.json
sed -Eni '2,$s%^\["([^"]*)","([^"]*)"](,|])$%https://web.archive.org/web/\2id_/\1%gmp' out.json
wget -i out.json
You can even filter the results and such; see the docs.
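For example (parameter names as I recall them from the CDX server docs, so double-check there), you can keep only successful captures in a date range and collapse duplicate content:
wget "http://web.archive.org/cdx/search/cdx?url=${url}*&output=json&fl=original,timestamp&filter=statuscode:200&from=2010&to=2014&collapse=digest" -O out.json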
C:\Users\COMPUTER\Desktop\566\download.sh: syntax error near unexpected token `elems=($'
C:\Users\COMPUTER\Desktop\566\download.sh: line 77: `IFS="$tab" elems=($line)'