Skip to content

Instantly share code, notes, and snippets.

@ingamedeo
Last active April 9, 2024 22:10
Show Gist options
  • Star 27 You must be signed in to star a gist
  • Fork 5 You must be signed in to fork a gist
  • Save ingamedeo/50df5def5bce7edfbd4c6b71aa385328 to your computer and use it in GitHub Desktop.
Save ingamedeo/50df5def5bce7edfbd4c6b71aa385328 to your computer and use it in GitHub Desktop.
web.archive.org downloader
#!/usr/bin/python
# Politecnico di Milano
# date: 22/06/2018
# name: Amedeo Baragiola <amedeo.baragiola@mail.polimi.it>
# description: web.archive.org downloader
import os
import datetime
import requests
import json
import re
website = "@@@DOMAIN@@@";
yr = "2020";
def download_url(url, ts):
try:
original_url = url
url = "http://web.archive.org/web/"+ts+"id_/"+url
print "Downloading "+url+" @ "+ts
r2 = requests.get(url, headers=headers)
url = url.replace("http://", "")
url = url.replace("-", "_")
url = re.sub('[^0-9a-zA-Z]+', '_', url)
file = open(url+".cont.js", "w")
for x,y in r2.headers.items():
file.write(x+": "+y+"\n")
file.write("\n")
file.write(r2.content)
file.close()
file = open(url+".url.txt", "w")
file.write(original_url)
file.close()
except:
print "download_url -> !!CONN_FAILED!! Retrying.."
download_url(original_url, ts)
def get_snapshots(url):
try:
r = requests.get("http://web.archive.org/__wb/calendarcaptures?url="+url+"&selected_year="+yr)
print "Loading snapshots from "+"http://web.archive.org/__wb/calendarcaptures?url="+url+"&selected_year="+yr
data = json.loads(r.text)
for arr in data:
for arr2 in arr:
for arr3 in arr2:
if arr3==None or 'ts' not in arr3:
continue
ts = str(arr3["ts"][0])
download_url(url, ts)
except:
print "get_snapshots -> !!CONN_FAILED!! Retrying.."
get_snapshots(url)
print "web.archive"
r = requests.get('http://web.archive.org/cdx/search?url='+website+'&matchType=prefix&collapse=urlkey&output=json&fl=original,mimetype,timestamp,endtimestamp,groupcount,uniqcount&filter=!statuscode:[45]..&limit=100000&_=1529584262506')
print "Got "+str(r.status_code)
#print r.text
data = json.loads(r.text)
headers = {'User-Agent': 'Mozilla/5.0 (Linux; Android 7.1.1; en-bw; Moto E (4) Build/NCQ26.69-46;) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/55.0.2883.91 Mobile Safari/537.36'}
#Access outer array
for arr in data:
url = arr[0];
if url.startswith("http://"):
print url
get_snapshots(url)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment