Skip to content

Instantly share code, notes, and snippets.

@ingamedeo ingamedeo/webarchive.py Secret
Last active Feb 4, 2019

Embed
What would you like to do?
web.archive.org downloader
#!/usr/bin/python
import os
import datetime
import requests
import json
import re
def _snapshot_basename(archive_url):
    """Turn a snapshot URL into a flat, filesystem-safe file name stem."""
    # Each run of non-alphanumeric characters (this already covers "-" and
    # "_") collapses into a single "_".
    return re.sub('[^0-9a-zA-Z]+', '_', archive_url.replace("http://", ""))


def _as_bytes(text):
    """Return ``text`` as a byte string, UTF-8 encoding it when necessary."""
    return text if isinstance(text, bytes) else text.encode("utf-8")


def download_url(url, ts, max_retries=5):
    """Fetch one archived capture of ``url`` and store it in the working dir.

    Two files are written, both named after the sanitized snapshot URL:

    * ``<stem>.cont.js``  - the response headers (``Name: value`` per line),
      a blank line, then the raw response body.
    * ``<stem>.url.txt``  - the original (un-archived) URL.

    Args:
        url: The original URL whose capture should be downloaded.
        ts: Capture timestamp string as used in web.archive.org URLs.
        max_retries: How many times the HTTP request is attempted before
            giving up. Failures are reported on stdout, never raised.

    Note:
        Relies on the module-level ``headers`` dict for the request headers.
    """
    original_url = url
    # NOTE(review): the "id_" suffix presumably requests the capture without
    # Wayback's page rewriting - confirm against the Wayback documentation.
    archive_url = "http://web.archive.org/web/" + ts + "id_/" + url

    # Only the network call is retried; a bounded loop replaces unbounded
    # recursion so a permanently failing URL cannot hang the whole run.
    for _attempt in range(max_retries):
        print("Downloading " + archive_url + " @ " + ts)
        try:
            response = requests.get(archive_url, headers=headers, timeout=60)
        except requests.RequestException:
            print("download_url -> !!CONN_FAILED!! Retrying..")
            continue
        break
    else:
        print("download_url -> giving up on " + archive_url)
        return

    stem = _snapshot_basename(archive_url)
    header_blob = "".join(
        name + ": " + value + "\n" for name, value in response.headers.items()
    ) + "\n"

    # Binary mode keeps the body byte-exact (no newline translation) and the
    # "with" blocks close the files even when a write fails.
    try:
        with open(stem + ".cont.js", "wb") as content_file:
            content_file.write(_as_bytes(header_blob))
            content_file.write(response.content)
        with open(stem + ".url.txt", "wb") as url_file:
            url_file.write(_as_bytes(original_url))
    except (IOError, OSError) as err:
        # Retrying cannot fix a disk/name problem; report it and move on.
        print("download_url -> could not save " + stem + ": " + str(err))
def _capture_timestamps(calendar):
    """Collect the first ``ts`` value of every capture entry in ``calendar``.

    ``calendar`` is the decoded calendarcaptures payload, walked as three
    nested levels of lists (presumably a year/month/week grid - TODO confirm).
    Entries that are ``None`` or lack a ``'ts'`` key are skipped.
    """
    stamps = []
    for outer in calendar:
        for middle in outer:
            for entry in middle:
                if entry is None or 'ts' not in entry:
                    continue
                stamps.append(str(entry["ts"][0]))
    return stamps


def get_snapshots(url, max_retries=5):
    """Download every capture of ``url`` listed by the Wayback calendar API.

    Args:
        url: The original URL to look up on web.archive.org.
        max_retries: How many times the calendar lookup is attempted before
            giving up. Failures are reported on stdout, never raised.

    The ``selected_year=####`` part of the query is a placeholder that is
    sent as-is.
    """
    calendar_url = (
        "http://web.archive.org/__wb/calendarcaptures?url="
        + url
        + "&selected_year=####"
    )

    # Retry only the lookup/parsing step, and only a bounded number of times;
    # the downloads below run outside the try so that one failing download
    # no longer restarts the whole snapshot list.
    for _attempt in range(max_retries):
        try:
            response = requests.get(calendar_url, timeout=60)
            print("Loading snapshots from " + calendar_url)
            timestamps = _capture_timestamps(json.loads(response.text))
        except (requests.RequestException, ValueError, TypeError,
                KeyError, IndexError):
            # Covers connection errors, non-JSON bodies and payloads whose
            # shape is not the expected nested lists.
            print("get_snapshots -> !!CONN_FAILED!! Retrying..")
            continue
        break
    else:
        print("get_snapshots -> giving up on " + url)
        return

    for ts in timestamps:
        download_url(url, ts)
# Script body: ask the CDX index for every archived URL under the site
# prefix, then mirror all captures of each plain-http URL it returns.
# "###WEBSITE###" is a placeholder to be replaced with the target site.
cdx_query = 'http://web.archive.org/cdx/search?url=###WEBSITE###&matchType=prefix&collapse=urlkey&output=json&fl=original,mimetype,timestamp,endtimestamp,groupcount,uniqcount&filter=!statuscode:[45]..&limit=100000&_=1529584262506'

print("web.archive")
r = requests.get(cdx_query)
print("Got " + str(r.status_code))
data = json.loads(r.text)

# Request headers used by download_url() for every snapshot fetch; must stay
# a module-level name because that function reads it as a global.
headers = {'User-Agent': 'Mozilla/5.0 (Linux; Android 7.1.1; en-bw; Moto E (4) Build/NCQ26.69-46;) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/55.0.2883.91 Mobile Safari/537.36'}

# Each row is a list whose first field is the original URL; rows whose first
# field does not start with "http://" are skipped.
for arr in data:
    url = arr[0]
    if not url.startswith("http://"):
        continue
    print(url)
    get_snapshots(url)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
You can’t perform that action at this time.