Script for filtering URLs before submitting them to the Wayback Machine for archiving
#!/usr/bin/env python3
# This script filters a list of URLs to determine which of them should be
# submitted to the Internet Archive's Wayback Machine for archiving because
# they are valid URLs and/or they aren't already archived in the Wayback
# Machine.
#
# To check for and filter out URLs that are invalid, specify --fetch on the
# command line. To filter out URLs that are already archived in the Wayback
# Machine, specify --check-archive on the command line. You must specify either
# or both of these options. Also specify on the command line a file name
# containing the list of URLs to check. The list of URLs to keep, i.e., they're
# valid URLs and/or they're not already archived, will be printed to stdout.
#
# Author: Jonathan Kamens <jik@kamens.us>
#
# Copyright 2022 Jonathan Kamens. You can do whatever you want with this script
# as long as you leave this copyright notice intact.
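#
# Example usage (script and file names are illustrative): if urls.txt
# contains one URL per line, the following prints the URLs that still need
# to be submitted for archiving:
#
#   ./filter-wayback-urls.py --fetch --check-archive urls.txt > to-archive.txt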

import argparse
import requests
import sys
import time
import urllib.parse

debug_enabled = None
debug_prefix = None


def parse_args():
    global debug_enabled
    parser = argparse.ArgumentParser(description='Check URLs for submission '
                                     'to the Wayback Machine')
    parser.add_argument('--debug', action='store_true', default=False,
                        help='Generate debug output to stderr')
    parser.add_argument('--fetch', action='store_true', default=False,
                        help='Try fetching URLs')
    parser.add_argument('--check-archive', action='store_true', default=False,
                        help='Check if URLs are already archived')
    parser.add_argument('url_list', metavar='URL-LIST', help='File containing '
                        'list of URLs')
    args = parser.parse_args()
    if not (args.fetch or args.check_archive):
        parser.error('Must specify at least one of --fetch or '
                     '--check-archive')
    debug_enabled = args.debug
    return args


def debug(*args):
    if debug_enabled:
        if debug_prefix:
            print(debug_prefix, end='', file=sys.stderr)
        print(*args, file=sys.stderr, flush=True)
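

# Perform a GET request, retrying with exponential backoff (doubling the
# delay, capped at 60 seconds) for as long as the server returns HTTP 429
# (Too Many Requests).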
def backoff(*args, **kwargs):
    sleep_for = 1
    while True:
        response = requests.get(*args, **kwargs)
        if response.status_code != 429:
            return response
        debug(f'Got 429 response, sleeping for {sleep_for}')
        time.sleep(sleep_for)
        sleep_for = min(sleep_for * 2, 60)
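

# Decide whether a single URL should be kept: optionally verify that the URL
# is fetchable (--fetch), optionally confirm that it isn't already archived
# in the Wayback Machine (--check-archive), and print it to stdout if it
# survives the requested checks.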
def try_url(args, url):
    debug('Trying')
    if args.fetch:
        try:
            debug('Calling HEAD')
            response = requests.head(url, timeout=10)
            debug(f'Response to HEAD is {response}')
            if response.status_code == 405:
                debug('Calling GET')
                response = requests.get(url, timeout=10)
                debug(f'Response to GET is {response}')
            status_code = response.status_code
        except Exception as e:
            debug(f'Fetch exception {repr(e)}, proceeding')
            # Assume intermittent issue
            pass
        else:
            if status_code in (404, 410):
                debug('Returning for known bad status')
                return
            # If the site is going to be obnoxious and return a 403 status
            # code because we're a script, then we're going to be obnoxious
            # back and assume that the page exists and needs to be archived.
            #
            # Status code 999 seems to be another case of a web server being
            # obnoxious so we'll just treat that as success too.
            if status_code not in (200, 301, 302, 303, 307, 403, 999):
                debug('Returning for not known good status')
                return
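    # First consult the Wayback Machine's "available" API to see whether a
    # snapshot of this URL already exists.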
    if args.check_archive:
        debug('Checking archive')
        wayback_url = f'https://archive.org/wayback/available?url={url}'
        debug(f'available URL is {wayback_url}')
        try:
            response = requests.get(wayback_url, timeout=10)
            debug(f'Response from endpoint URL is {response}')
            response.raise_for_status()
            try:
                debug(f'Endpoint response JSON is {response.json()}')
            except Exception:
                debug(f'Endpoint response content (not JSON) is '
                      f'{response.content}')
                raise
            next(snapshot
                 for snapshot in
                 response.json()['archived_snapshots'].values()
                 if snapshot.get('available', False))
            debug('Returning for URL in archive')
            return
        except Exception as e:
            debug(f'Archive check exception {repr(e)}, proceeding')
            pass
        # The API endpoint above is unreliable, so if it claims the URL isn't
        # in the Wayback Machine we check again using a more reliable
        # endpoint. We don't _just_ use this endpoint because it's
        # rate-limited, so we only want to use it when we have to.
        debug('Available endpoint returned nothing, trying sparkline')
        try:
            wayback_url = (f'https://web.archive.org/__wb/sparkline?'
                           f'output=json&url={urllib.parse.quote(url)}&'
                           f'collection=web')
            debug(f'sparkline URL is {wayback_url}')
            headers = {'Referer': 'https://web.archive.org'}
            response = backoff(wayback_url, headers=headers, timeout=10)
            debug(f'Response from endpoint URL is {response}')
            response.raise_for_status()
            try:
                debug(f'Endpoint response JSON is {response.json()}')
            except Exception:
                debug(f'Endpoint response content (not JSON) is '
                      f'{response.content}')
                raise
            next(iter(response.json()['years']))
            debug('Returning for URL in archive')
            return
        except Exception as e:
            debug(f'Archive check exception {repr(e)}, proceeding')
            pass
    debug('Keeping')
    print(url)


def main():
    args = parse_args()
    global debug_prefix
    for url in open(args.url_list):
        url = url.strip()
        debug_prefix = f'{url}: '
        try_url(args, url)
        if not args.debug:
            print('.', end='', flush=True, file=sys.stderr)
    debug_prefix = None
    if not args.debug:
        print('', file=sys.stderr)


if __name__ == '__main__':
    main()