#!/usr/bin/env python3
# This script filters a list of URLs to determine which of them should be
# submitted to the Internet Archive's Wayback Machine for archiving because
# they are valid URLs and/or they aren't already archived in the Wayback
# Machine.
#
# To check for and filter out URLs which are invalid, specify --fetch on the
# command line. To filter out URLs that are already archived in the Wayback
# Machine, specify --check-archive on the command line. You must specify one
# or both of these options. Also specify on the command line the name of a
# file containing the list of URLs to check. The list of URLs to keep, i.e.,
# those that are valid and/or not already archived, is printed to stdout.
#
# Author: Jonathan Kamens <jik@kamens.us>
#
# Copyright 2022 Jonathan Kamens. You can do whatever you want with this
# script as long as you leave this copyright notice intact.
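#
# A minimal usage sketch (the script and file names here are illustrative,
# not part of the script itself):
#
#     ./filter-urls.py --fetch --check-archive urls.txt > to-archive.txt
#
# urls.txt holds one URL per line; the URLs worth submitting for archiving
# end up in to-archive.txt, while progress dots go to stderr.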
import argparse
import requests
import sys
import time
import urllib.parse

debug_enabled = None
debug_prefix = None


def parse_args():
    global debug_enabled
    parser = argparse.ArgumentParser(description='Check URLs for submission '
                                     'to the Wayback Machine')
    parser.add_argument('--debug', action='store_true', default=False,
                        help='Generate debug output to stderr')
    parser.add_argument('--fetch', action='store_true', default=False,
                        help='Try fetching URLs')
    parser.add_argument('--check-archive', action='store_true', default=False,
                        help='Check if URLs are already archived')
    parser.add_argument('url_list', metavar='URL-LIST', help='File containing '
                        'list of URLs')
    args = parser.parse_args()
    if not (args.fetch or args.check_archive):
        parser.error('Must specify at least one of --fetch or '
                     '--check-archive')
    debug_enabled = args.debug
    return args


def debug(*args):
    if debug_enabled:
        if debug_prefix:
            print(debug_prefix, end='', file=sys.stderr)
        print(*args, file=sys.stderr, flush=True)
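

# Issue a GET request, retrying with exponential backoff (doubling the delay,
# capped at 60 seconds) for as long as the server answers 429 Too Many
# Requests.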
def backoff(*args, **kwargs):
    sleep_for = 1
    while True:
        response = requests.get(*args, **kwargs)
        if response.status_code != 429:
            return response
        debug(f'Got 429 response, sleeping for {sleep_for}')
        time.sleep(sleep_for)
        sleep_for = min(sleep_for * 2, 60)
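

# Decide whether to keep a single URL: optionally fetch it to confirm it
# still exists, optionally check whether the Wayback Machine already has a
# snapshot of it, and print it to stdout only if it survives the checks.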
def try_url(args, url):
    debug('Trying')
    if args.fetch:
        try:
            debug('Calling HEAD')
            response = requests.head(url, timeout=10)
            debug(f'Response to HEAD is {response}')
            if response.status_code == 405:
                debug('Calling GET')
                response = requests.get(url, timeout=10)
                debug(f'Response to GET is {response}')
            status_code = response.status_code
        except Exception as e:
            debug(f'Fetch exception {repr(e)}, proceeding')
            # Assume intermittent issue
            pass
        else:
            if status_code in (404, 410):
                debug('Returning for known bad status')
                return
            # If the site is going to be obnoxious and return a 403 status
            # code because we're a script, then we're going to be obnoxious
            # back and assume that the page exists and needs to be archived.
            #
            # Status code 999 seems to be another case of a web server being
            # obnoxious, so we'll just treat that as success too.
            if status_code not in (200, 301, 302, 303, 307, 403, 999):
                debug('Returning for not known good status')
                return
    if args.check_archive:
        debug('Checking archive')
        wayback_url = f'https://archive.org/wayback/available?url={url}'
        debug(f'available URL is {wayback_url}')
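        # The availability endpoint typically answers with JSON along the
        # lines of (values here are illustrative):
        #   {"archived_snapshots": {"closest": {"available": true,
        #    "url": "https://web.archive.org/web/...", ...}}}
        # so finding any snapshot with "available": true below means the URL
        # is already archived.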
        try:
            response = requests.get(wayback_url, timeout=10)
            debug(f'Response from endpoint URL is {response}')
            response.raise_for_status()
            try:
                debug(f'Endpoint response JSON is {response.json()}')
            except Exception:
                debug(f'Endpoint response content (not JSON) is '
                      f'{response.content}')
                raise
            # If there is no available snapshot, next() raises StopIteration,
            # which the except clause below swallows, and we fall through to
            # the sparkline check.
            next(snapshot
                 for snapshot in
                 response.json()['archived_snapshots'].values()
                 if snapshot.get('available', False))
            debug('Returning for URL in archive')
            return
        except Exception as e:
            debug(f'Archive check exception {repr(e)}, proceeding')
            pass
        # The API endpoint above is unreliable, so if it claims the URL isn't
        # in the Wayback Machine we check again using a more reliable
        # endpoint. We don't _just_ use that endpoint because it's
        # rate-limited, so we only want to use it when we have to.
        debug('Available endpoint returned nothing, trying sparkline')
        try:
            wayback_url = (f'https://web.archive.org/__wb/sparkline?'
                           f'output=json&url={urllib.parse.quote(url)}&'
                           f'collection=web')
            debug(f'sparkline URL is {wayback_url}')
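            # The sparkline endpoint summarizes captures per year; a
            # non-empty 'years' mapping in its response is taken to mean the
            # URL has at least one capture. The Referer header appears to be
            # required for the endpoint to accept the request.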
            headers = {'Referer': 'https://web.archive.org'}
            response = backoff(wayback_url, headers=headers, timeout=10)
            debug(f'Response from endpoint URL is {response}')
            response.raise_for_status()
            try:
                debug(f'Endpoint response JSON is {response.json()}')
            except Exception:
                debug(f'Endpoint response content (not JSON) is '
                      f'{response.content}')
                raise
            next(iter(response.json()['years']))
            debug('Returning for URL in archive')
            return
        except Exception as e:
            debug(f'Archive check exception {repr(e)}, proceeding')
            pass
    debug('Keeping')
    print(url)


def main():
    args = parse_args()
    global debug_prefix
    for url in open(args.url_list):
        url = url.strip()
        debug_prefix = f'{url}: '
        try_url(args, url)
        if not args.debug:
            print('.', end='', flush=True, file=sys.stderr)
        debug_prefix = None
    if not args.debug:
        print('', file=sys.stderr)


if __name__ == '__main__':
    main()