# Source: GitHub Gist tsujamin/23ad637499c7bf6f5eaef028df1a528e
# Author: @tsujamin
# Created: December 10, 2016 00:50
#
# (GitHub page navigation text that was captured together with the gist has
# been reduced to this comment header; it is not part of the script.)
import requests
import time
# Accumulators filled in by the script below.
# WORONI_IDS receives (date, issue id) tuples; WORONI_PDFS is declared here
# but not used anywhere in the visible part of the script.
WORONI_IDS = []
WORONI_PDFS = []

# Pieces of the issue-listing request; they are concatenated in this order
# (server + search path + key) to form the full URL.
BASE_SERVER = "http://api.trove.nla.gov.au"
# NOTE(review): "include=years" appears twice in the query string --
# presumably redundant; confirm against the API before removing it.
ISSUE_SEARCH = "/newspaper/title/666?encoding=json&include=years&include=years&range=19500614-20071101"
# NOTE(review): the API key is hard-coded in the source; consider loading it
# from the environment or a config file rather than committing it.
KEY_PARAM = "&key=21cu4u3jraqik0uq"
# --- Step 1: collect a (date, issue id) pair for every issue in the listing ---
#
# Requests the issue listing, retrying until a response can be decoded as
# JSON, appends one (date, int(id)) tuple per issue to WORONI_IDS, and then
# dumps the collected tuples to ./ids.txt (one tuple repr per line).
print("scraping article IDs")

# The URL never changes between retries, so build it once.
url = BASE_SERVER + ISSUE_SEARCH + KEY_PARAM

while True:
    print(url)
    resp = requests.get(url)

    if resp.status_code == 403:
        print("rate limited. sleeping 15s and retrying...")
        time.sleep(15)
        # Retry instead of falling through: the 403 body is not the listing,
        # and trying to parse it would either raise KeyError or be reported
        # as a second, misleading failure.
        continue

    try:
        years = resp.json()["newspaper"]["year"]
    except ValueError:
        # Response.json() raises ValueError when the body is not valid JSON.
        print("failed to request... retrying")
        # Brief pause so repeated failures do not hammer the server.
        time.sleep(1)
        continue

    for year in years:
        # Tolerate year entries that carry no issue list rather than
        # aborting the whole scrape with a KeyError.
        for issue in year.get("issue", []):
            WORONI_IDS.append((issue["date"], int(issue["id"])))
            print(issue["id"])
    break

print("{} issues...".format(len(WORONI_IDS)))

# The context manager guarantees the file is flushed and closed even if
# formatting or writing fails part-way through.
with open("./ids.txt", "w") as f:
    f.writelines("{}\n".format(entry) for entry in WORONI_IDS)
# --- Step 2: download one PDF per collected issue ---
#
# For each (date, issue id) pair in WORONI_IDS the script performs three
# requests against the rendition endpoint:
#   1. "prep" -- its response body is used as the "followup" token and its
#                cookies are reused for the next two requests;
#   2. "ping" -- polled until it answers with HTTP 200;
#   3. "pdf"  -- streamed to "<date>.pdf" in the current directory.
PDF_BASE = "http://trove.nla.gov.au/newspaper/rendition/nla.news-issue"
# Example URLs for the three steps:
#http://trove.nla.gov.au/newspaper/rendition/nla.news-issue1328883/prep?_=1472298408713
#http://trove.nla.gov.au/newspaper/rendition/nla.news-issue1328883.ping?followup=7ca7e0da48c26892bcb4ee80ce8061de&_=1472298408714
#http://trove.nla.gov.au/newspaper/rendition/nla.news-issue1328883.pdf?followup=7ca7e0da48c26892bcb4ee80ce8061de
for date, _id in WORONI_IDS:
    # The trailing "_" parameter is the current Unix time in seconds
    # (presumably a cache-buster, as in the example URLs above).
    prep_url = "{}{}/prep?_={}".format(PDF_BASE, _id, int(time.time()))
    print(prep_url)
    prep = requests.get(prep_url)
    cookies = prep.cookies
    followup = prep.text
    print(followup)

    # Poll until the server answers 200 for this followup token.
    # NOTE(review): this loop has no upper bound; it never terminates if the
    # server never returns 200 for this issue.
    while True:
        ping_url = "{}{}.ping?followup={}&_={}".format(PDF_BASE, _id, followup, int(time.time()))
        ping = requests.get(ping_url, cookies=cookies)
        if ping.status_code == 200:
            break
        # Wait between polls instead of spinning on the server.
        time.sleep(1)

    pdf_url = "{}{}.pdf?followup={}".format(PDF_BASE, _id, followup)
    pdf = requests.get(pdf_url, cookies=cookies, stream=True)
    try:
        if pdf.status_code != 200:
            # Do not save an error page under a .pdf name; move on to the
            # next issue so one bad download does not stop the run.
            print("failed to download {} (HTTP {}), skipping".format(pdf_url, pdf.status_code))
            continue
        with open("{}.pdf".format(date), 'wb') as fd:
            for chunk in pdf.iter_content(1024):
                fd.write(chunk)
    finally:
        # A streamed response holds its connection until it is closed.
        pdf.close()
# (End of gist. The GitHub "Sign up for free to join this conversation" footer that followed is page text, not part of the script.)