Skip to content

Instantly share code, notes, and snippets.

@nscaife
Forked from wcaleb/waybackup.py
Last active August 14, 2017 12:03
Show Gist options
  • Save nscaife/16dfcb7cc9349006cd05ac8bf72e596b to your computer and use it in GitHub Desktop.
Save nscaife/16dfcb7cc9349006cd05ac8bf72e596b to your computer and use it in GitHub Desktop.
Save URLs from file to Wayback Machine
#! /usr/bin/env python
import re
import requests
import time
import urllib.parse
# Root of the Wayback Machine site. The script appends '/save/<url>' to it to
# request a snapshot, and appends the response's content-location header to it
# to print the address of the archived copy.
base_url = 'http://web.archive.org'
def get_urls(file):
    """Return every http/https URL found in the text file at ``file``.

    Args:
        file: Path of the text file to scan.

    Returns:
        A list of the matched URL strings in order of appearance.
        Duplicates are kept; the list is empty when nothing matches.
    """
    # Raw string: ``\(`` and ``\)`` are not valid *string* escapes, so a
    # plain literal triggers an invalid-escape warning on modern Python.
    # The pattern handed to the regex engine is unchanged.
    url_pattern = r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
    with open(file, 'r') as f:
        data = f.read()
    return re.findall(url_pattern, data)
# Submit each distinct URL found in FILE.TXT to the Wayback Machine's save
# endpoint and print where the archived copy lives.
urls = set(get_urls('FILE.TXT'))  # set() drops duplicate URLs
for url in urls:
    print(url)
    try:
        # The request is inside the try block so that a connection failure or
        # timeout for one URL is reported and the loop moves on, instead of
        # aborting the whole run. The timeout keeps a stalled connection from
        # hanging the script indefinitely.
        r = requests.get(base_url + '/save/' + url, timeout=120)
        if r.status_code == requests.codes.ok:
            # content-location holds the path of the new snapshot; a missing
            # header raises KeyError, which is handled below.
            print(base_url + r.headers['content-location'])
        else:
            print('Error in response: ' + str(r.status_code))
    except (requests.RequestException, KeyError):
        # Only the expected failures are caught; Ctrl-C and genuine
        # programming errors still propagate.
        print('Exception in response!')
    # Pause between submissions so the archive service is not hit in a burst.
    time.sleep(15)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment