#!/usr/bin/env python3
# This script filters a list of URLs to determine which of them should be
# submitted to the Internet Archive's Wayback Machine for archiving because
# they are valid URLs and/or they aren't already archived in the Wayback
# Machine.
#
# To check for and filter out URLs which are invalid, specify --fetch on the
# command line. To filter out URLs that are already archived in the Wayback
# Machine, specify --check-archive on the command line. You must specify one
# or both of these options. Also specify on the command line the name of a
# file containing the list of URLs to check. The list of URLs to keep, i.e.,
# those that are valid and/or not already archived, is printed to stdout.
#
# Author: Jonathan Kamens <jik@kamens.us>
#
# Copyright 2022 Jonathan Kamens. You can do whatever you want with this
# script as long as you leave this copyright notice intact.
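#
# A minimal usage sketch (the script and file names here are illustrative,
# not part of the script itself):
#
#     ./filter-urls.py --fetch --check-archive urls.txt > to-archive.txt
#
# urls.txt holds one URL per line; the URLs worth submitting for archiving
# end up in to-archive.txt, while progress dots go to stderr.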
import argparse
import requests
import sys
import time
import urllib.parse

debug_enabled = None
debug_prefix = None


def parse_args():
    global debug_enabled
    parser = argparse.ArgumentParser(description='Check URLs for submission '
                                     'to the Wayback Machine')
    parser.add_argument('--debug', action='store_true', default=False,
                        help='Generate debug output to stderr')
    parser.add_argument('--fetch', action='store_true', default=False,
                        help='Try fetching URLs')
    parser.add_argument('--check-archive', action='store_true', default=False,
                        help='Check if URLs are already archived')
    parser.add_argument('url_list', metavar='URL-LIST', help='File containing '
                        'list of URLs')
    args = parser.parse_args()
    if not (args.fetch or args.check_archive):
        parser.error('Must specify at least one of --fetch or '
                     '--check-archive')
    debug_enabled = args.debug
    return args


def debug(*args):
    if debug_enabled:
        if debug_prefix:
            print(debug_prefix, end='', file=sys.stderr)
        print(*args, file=sys.stderr, flush=True)
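

# Issue a GET request, retrying with exponential backoff (doubling the delay,
# capped at 60 seconds) for as long as the server answers 429 Too Many
# Requests.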
def backoff(*args, **kwargs):
    sleep_for = 1
    while True:
        response = requests.get(*args, **kwargs)
        if response.status_code != 429:
            return response
        debug(f'Got 429 response, sleeping for {sleep_for}')
        time.sleep(sleep_for)
        sleep_for = min(sleep_for * 2, 60)
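

# Decide whether to keep a single URL: optionally fetch it to confirm it
# still exists, optionally check whether the Wayback Machine already has a
# snapshot of it, and print it to stdout only if it survives the checks.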
def try_url(args, url):
    debug('Trying')
    if args.fetch:
        try:
            debug('Calling HEAD')
            response = requests.head(url, timeout=10)
            debug(f'Response to HEAD is {response}')
            if response.status_code == 405:
                debug('Calling GET')
                response = requests.get(url, timeout=10)
                debug(f'Response to GET is {response}')
            status_code = response.status_code
        except Exception as e:
            debug(f'Fetch exception {repr(e)}, proceeding')
            # Assume intermittent issue
            pass
        else:
            if status_code in (404, 410):
                debug('Returning for known bad status')
                return
            # If the site is going to be obnoxious and return a 403 status
            # code because we're a script, then we're going to be obnoxious
            # back and assume that the page exists and needs to be archived.
            #
            # Status code 999 seems to be another case of a web server being
            # obnoxious, so we'll just treat that as success too.
            if status_code not in (200, 301, 302, 303, 307, 403, 999):
                debug('Returning for not known good status')
                return
    if args.check_archive:
        debug('Checking archive')
        wayback_url = f'https://archive.org/wayback/available?url={url}'
        debug(f'available URL is {wayback_url}')
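        # The availability endpoint typically answers with JSON along the
        # lines of (values here are illustrative):
        #   {"archived_snapshots": {"closest": {"available": true,
        #    "url": "https://web.archive.org/web/...", ...}}}
        # so finding any snapshot with "available": true below means the URL
        # is already archived.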
        try:
            response = requests.get(wayback_url, timeout=10)
            debug(f'Response from endpoint URL is {response}')
            response.raise_for_status()
            try:
                debug(f'Endpoint response JSON is {response.json()}')
            except Exception:
                debug(f'Endpoint response content (not JSON) is '
                      f'{response.content}')
                raise
            # If there is no available snapshot, next() raises StopIteration,
            # which the except clause below swallows, and we fall through to
            # the sparkline check.
            next(snapshot
                 for snapshot in
                 response.json()['archived_snapshots'].values()
                 if snapshot.get('available', False))
            debug('Returning for URL in archive')
            return
        except Exception as e:
            debug(f'Archive check exception {repr(e)}, proceeding')
            pass
        # The API endpoint above is unreliable, so if it claims the URL isn't
        # in the Wayback Machine we check again using a more reliable
        # endpoint. We don't _just_ use that endpoint because it's
        # rate-limited, so we only want to use it when we have to.
        debug('Available endpoint returned nothing, trying sparkline')
        try:
            wayback_url = (f'https://web.archive.org/__wb/sparkline?'
                           f'output=json&url={urllib.parse.quote(url)}&'
                           f'collection=web')
            debug(f'sparkline URL is {wayback_url}')
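            # The sparkline endpoint summarizes captures per year; a
            # non-empty 'years' mapping in its response is taken to mean the
            # URL has at least one capture. The Referer header appears to be
            # required for the endpoint to accept the request.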
            headers = {'Referer': 'https://web.archive.org'}
            response = backoff(wayback_url, headers=headers, timeout=10)
            debug(f'Response from endpoint URL is {response}')
            response.raise_for_status()
            try:
                debug(f'Endpoint response JSON is {response.json()}')
            except Exception:
                debug(f'Endpoint response content (not JSON) is '
                      f'{response.content}')
                raise
            next(iter(response.json()['years']))
            debug('Returning for URL in archive')
            return
        except Exception as e:
            debug(f'Archive check exception {repr(e)}, proceeding')
            pass
    debug('Keeping')
    print(url)


def main():
    args = parse_args()
    global debug_prefix
    for url in open(args.url_list):
        url = url.strip()
        debug_prefix = f'{url}: '
        try_url(args, url)
        if not args.debug:
            print('.', end='', flush=True, file=sys.stderr)
        debug_prefix = None
    if not args.debug:
        print('', file=sys.stderr)


if __name__ == '__main__':
    main()