Skip to content

Instantly share code, notes, and snippets.

Created July 4, 2023 14:03
Show Gist options
  • Save weiglemc/1fb86319177f98eb91e920155da720b5 to your computer and use it in GitHub Desktop.
Save weiglemc/1fb86319177f98eb91e920155da720b5 to your computer and use it in GitHub Desktop.
Python script to grab data from the Internet Archive via the CDX API server; it uses a function from Sawood Alam's CDXSummary tool.
from requests import Session
from rich.console import Console
from urllib.parse import urlencode
# URL to look up (sent as the ``url`` query parameter); empty in this copy.
URIR = ""
# Capture date range, given as YYYYMMDD strings.
FROM = "20150424"
TO = "20220923"
# Extra query-string parameters appended to every request:
# only one entry per day, 200 OK
OTHER_PARAMS = f"&from={FROM}&to={TO}&collapse=timestamp:8&filter=statuscode:200"
# One HTTP session shared by all requests in this script.
REQSESSION = Session()
# Progress/diagnostic printer: red, non-highlighted text on stderr.
_stderr_console = Console(stderr=True, style="red", highlight=False)
errprint = _stderr_console.print
def get_stream_from_api(url):
    """Yield the lines of every result page of a paginated API query.

    The server is first asked how many pages the query has
    (``showNumPages=true``); each page is then requested in order and its
    body streamed line by line. A page whose response is not OK is reported
    on stderr and skipped, so one bad page does not abort the download.

    Args:
        url: Query URL that already contains a query string; the paging
            parameters are appended to it with ``&``.

    Yields:
        bytes: One line of a page's response body.

    Raises:
        ValueError: If the page-count response body is not an integer.
    """
    pages = int(REQSESSION.get(f"{url}&showNumPages=true").text)
    for page in range(pages):
        pageurl = f"{url}&page={page}"
        errprint(f"Downloading [[cyan]{page + 1}/{pages}[/cyan]]: [magenta]{pageurl}[/magenta]")
        # Using the response as a context manager releases the connection
        # even if the consumer stops iterating before the page is exhausted.
        with REQSESSION.get(pageurl, stream=True) as r:
            if not r.ok:
                # Previously a failed page vanished silently; make the gap
                # in the output visible to the user.
                errprint(f"Skipping page {page + 1}/{pages}: HTTP status {r.status_code}")
                continue
            # Have the raw stream undo any Content-Encoding (e.g. gzip).
            r.raw.decode_content = True
            for line in r.raw:
                yield line
def write_cdx(urir, cdxapi, params, outfile):
    """Download the API results for ``urir`` and write them to ``outfile``.

    Args:
        urir: URL to look up; it is URL-encoded and sent as the ``url``
            query parameter.
        cdxapi: Base URL of the API endpoint (without a query string).
        params: Pre-built query string placed before the ``url`` parameter.
        outfile: Path of the text file to create or overwrite.

    Returns:
        None. The downloaded lines are written to ``outfile``.
    """
    url = f"{cdxapi}?{params}&{urlencode({'url': urir})}"
    input_stream = get_stream_from_api(url)
    try:
        # Lines are decoded as UTF-8 (the bytes.decode default), so write
        # them back as UTF-8 instead of the platform's default encoding.
        # The context manager guarantees the file is flushed and closed.
        with open(outfile, "w", encoding="utf-8") as f:
            for line in input_stream:
                f.write(line.decode())
    finally:
        # Close the generator even if writing failed, so it can release
        # whatever it still holds.
        input_stream.close()
# Base URL of the CDX API endpoint; empty in this copy.
cdxapi = ""
# Exact-URL matching plus the shared date-range/collapse/status filters.
params = f"matchType=exact{OTHER_PARAMS}"
# Output file name encodes the requested date range.
outfile = f"cnn-{FROM}-{TO}-day.cdx"
write_cdx(urir=URIR, cdxapi=cdxapi, params=params, outfile=outfile)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment