Skip to content

Instantly share code, notes, and snippets.

@edsu
Last active July 13, 2023 16:14
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save edsu/9bb304836b4ba8adb64e3636e89f0c20 to your computer and use it in GitHub Desktop.
Save edsu/9bb304836b4ba8adb64e3636e89f0c20 to your computer and use it in GitHub Desktop.
Reads a text file of URLs (one per line) and writes out a CSV report of whether each URL has a capture in swap.stanford.edu
#!/usr/bin/env python3
import csv
import sys
import json
import time
import requests
def get_snapshots(url):
    """Look up the captures of *url* via the swap.stanford.edu CDX endpoint.

    The response body is parsed as one JSON object per non-blank line.

    Args:
        url: The URL to look up.

    Returns:
        A list of the parsed capture records, sorted by their "timestamp"
        value in descending order, or None when the response status is not
        200 or the body is empty. The list is empty when the body contains
        only whitespace.

    Raises:
        requests.RequestException: If the request fails or times out.
    """
    # Let requests build the query string so that characters such as "&",
    # "#" or "?" inside the looked-up URL are percent-encoded instead of
    # being interpreted as part of the CDX query itself.
    resp = requests.get(
        "https://swap.stanford.edu/was/cdx",
        params={"url": url, "output": "json"},
        # Without a timeout a stalled connection would hang the whole run.
        timeout=60,
    )
    if resp.status_code != 200 or not resp.text:
        return None

    # Skip blank lines: json.loads("") raises, and a whitespace-only body
    # would otherwise be parsed as a single empty record.
    records = [
        json.loads(line)
        for line in resp.text.strip().split("\n")
        if line.strip()
    ]
    # NOTE(review): assumes timestamps are fixed-width strings so that a
    # plain string sort is chronological -- confirm against the CDX output.
    records.sort(key=lambda record: record["timestamp"], reverse=True)
    return records
def main():
    """Write a CSV report of the latest capture for each URL in a seed file.

    The seed file path is taken from the first command-line argument and is
    read one URL per line; blank lines are skipped. A header row followed by
    one row per URL is written to standard output with the columns ``url``,
    ``latest_timestamp`` and ``swap_url``. The last two columns are left
    empty when no snapshot is returned for the URL.

    Raises:
        SystemExit: If no seed file argument was given.
    """
    if len(sys.argv) < 2:
        sys.exit(f"usage: {sys.argv[0]} <seed_file>")
    seed_filename = sys.argv[1]

    writer = csv.writer(sys.stdout)
    writer.writerow(["url", "latest_timestamp", "swap_url"])

    # The context manager closes the seed file even if a lookup raises.
    with open(seed_filename) as seed_file:
        for line in seed_file:
            url = line.strip()
            if not url:
                # A blank line would trigger a lookup for an empty URL.
                continue

            snapshots = get_snapshots(url)
            if snapshots:
                # Snapshots come back newest first, so the head is the latest.
                ts = snapshots[0]["timestamp"]
                writer.writerow(
                    [url, ts, f"https://swap.stanford.edu/was/{ts}/{url}"]
                )
            else:
                writer.writerow([url, None, None])

            # Pause between lookups so the archive is not hit in a tight loop.
            time.sleep(1)
# Run the report only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment