
@YoRyan
Created January 3, 2022 21:07
Use the Wayback Machine CDX API to search old websites and build a download list of archived captures.

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
from argparse import ArgumentTypeError, ArgumentParser
from collections import Counter
from datetime import datetime, timezone
from pathlib import Path
from re import search

from archivecdx import Listing as ArchiveListing
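
# Each capture is printed as a raw Wayback content URL followed by an
# indented "out=<path>" line, a format that aria2c can consume via its
# -i input-file option (an assumption about intended use).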


def main():
    def input_dt(s):
        # Parse an ISO 8601 date; naive datetimes are assumed to be in the
        # local timezone.
        try:
            dt = datetime.fromisoformat(s)
        except ValueError:
            raise ArgumentTypeError(f'invalid date: {s}')
        if dt.tzinfo:
            return dt
        else:
            return dt.astimezone()
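
    # The option names mirror the Wayback CDX API's query parameters
    # (matchType, from, to, filter, collapse).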

    parser = ArgumentParser(
        description='Resurrect websites from the Wayback Machine.')
    parser.add_argument('url', help='URL parameter to search')
    parser.add_argument('-matchType', default='exact',
                        choices=['exact', 'prefix', 'host', 'domain'],
                        help='select scope for search')
    parser.add_argument('-from', dest='tfrom', type=input_dt,
                        help='start search from this date')
    parser.add_argument('-to', dest='tto', type=input_dt,
                        help='end search at this date')
    parser.add_argument('-filter', action='append',
                        help='apply a search filter; add multiple for AND')
    parser.add_argument('-collapse', action='append',
                        help='apply a collapse filter; add multiple for AND')
    args = parser.parse_args()
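
    # Wayback/CDX timestamps are 14-digit YYYYMMDDhhmmss strings in UTC.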
    def wayback_time(dt):
        return dt.astimezone(timezone.utc).strftime('%Y%m%d%H%M%S')

    download_listing(ArchiveListing(
        args.url,
        matchType=args.matchType,
        filter=args.filter if args.filter else [],
        collapse=args.collapse if args.collapse else [],
        _from=wayback_time(args.tfrom) if args.tfrom else None,
        to=wayback_time(args.tto) if args.tto else None))
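

# A hypothetical invocation (the script name and URL are examples only):
#   python3 wayback-search.py -matchType domain -filter statuscode:200 \
#       -from 2005-01-01 example.com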


def download_listing(listing):
    def urlkey_path(urlkey):
        # CDX urlkeys are SURT-canonicalized, e.g. 'com,example)/index.html'.
        # Map one to a filesystem path rooted at the ordinary host name.
        parts = Path(urlkey).parts

        def domain(part):
            # The host component looks like 'com,example)' or
            # 'com,example:8080)': reversed labels, an optional port, and a
            # closing parenthesis.
            m = search(r'(.+?)(?::(\d+))?\)$', part)
            if not m:
                raise ValueError(f'invalid domain: {part}')
            domainl = m.group(1).split(',')
            domainl.reverse()
            domain = '.'.join(domainl)
            port = m.group(2)
            return f'{domain}:{port}' if port else domain

        return Path(*((domain(parts[0]),) + parts[1:]))
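
    # ':' can appear in a port number or a URL path but is not a legal
    # character in Windows file names.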
    def strip_illegals(path):
        def strip(s):
            return s.replace(':', '_')
        return Path(*(strip(p) for p in path.parts))
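
    # When the same urlkey was captured more than once, embed the capture
    # timestamp and digest in the file name so the copies don't collide.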
    def row_path(row, isdupe):
        if isdupe[row.urlkey]:
            parts = urlkey_path(row.urlkey).parts
            fsplit = parts[-1].split('.')
            if len(fsplit) > 1:
                fname = (f'{fsplit[0]}_{row.timestamp}_{row.digest}'
                         f".{'.'.join(fsplit[1:])}")
            else:
                fname = f'{fsplit[0]}_{row.timestamp}_{row.digest}'
            return strip_illegals(Path(*(parts[:-1] + (fname,))))
        else:
            return strip_illegals(urlkey_path(row.urlkey))

    urlkey_dupes = Counter(row.urlkey for row in listing)
    urlkey_isdupe = {urlkey: urlkey_dupes[urlkey] > 1
                     for urlkey in urlkey_dupes}
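
    # The id_ modifier requests the original capture, without the Wayback
    # Machine's rewritten links and toolbar.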
    for row in listing:
        out_path = row_path(row, urlkey_isdupe)
        print(f'https://web.archive.org/web/{row.timestamp}id_/{row.original}')
        print(f'\tout={out_path}')


if __name__ == '__main__':
    main()
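
For reference, one way to consume the output (this assumes the script is saved as wayback-search.py; the file names are examples only):

    python3 wayback-search.py -matchType prefix example.com/ > downloads.txt
    aria2c -i downloads.txt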