Created
December 10, 2016 00:50
-
-
Save tsujamin/23ad637499c7bf6f5eaef028df1a528e to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os
import time

import requests
# (date, issue id) tuples collected from the Trove title listing.
WORONI_IDS = []
# Not referenced anywhere in the visible script; kept for compatibility.
WORONI_PDFS = []

BASE_SERVER = "http://api.trove.nla.gov.au"
# Title 666, JSON encoding, per-year listing restricted to the given date range.
# (The original query repeated "include=years" twice; once is enough.)
ISSUE_SEARCH = "/newspaper/title/666?encoding=json&include=years&range=19500614-20071101"
# The API key can be overridden through the TROVE_API_KEY environment variable;
# when it is unset or empty the key originally embedded in the script is used.
KEY_PARAM = "&key=" + (os.environ.get("TROVE_API_KEY") or "21cu4u3jraqik0uq")
# Step 1: fetch the issue listing and record every (date, id) pair.
# The request is retried until a response with a parseable JSON body arrives.
print("scraping article IDs")
while True:
    url = BASE_SERVER + ISSUE_SEARCH + KEY_PARAM
    print(url)
    resp = requests.get(url)
    if resp.status_code == 403:
        print("rate limited. sleeping 15s and retrying...")
        time.sleep(15)
        # Retry with a fresh request instead of parsing the 403 response.
        continue
    try:
        years = resp.json()["newspaper"]["year"]
    except ValueError:
        # Body was not valid JSON.
        print("failed to request... retrying")
        continue
    for year in years:
        # A year entry without an "issue" list is skipped rather than raising
        # KeyError (presumably years outside the requested range -- confirm
        # against the API documentation).
        for issue in year.get("issue", []):
            WORONI_IDS.append((issue["date"], int(issue["id"])))
            print(issue["id"])
    break

print("{} issues...".format(len(WORONI_IDS)))

# Persist the collected tuples, one repr per line; the context manager
# guarantees the file is closed even if a write fails.
with open("./ids.txt", "w") as f:
    f.writelines("{}\n".format(entry) for entry in WORONI_IDS)
# Step 2: download one PDF per collected issue.
PDF_BASE = "http://trove.nla.gov.au/newspaper/rendition/nla.news-issue"
# Example of the three requests made for a single issue (prep, ping, pdf):
#http://trove.nla.gov.au/newspaper/rendition/nla.news-issue1328883/prep?_=1472298408713
#http://trove.nla.gov.au/newspaper/rendition/nla.news-issue1328883.ping?followup=7ca7e0da48c26892bcb4ee80ce8061de&_=1472298408714
#http://trove.nla.gov.au/newspaper/rendition/nla.news-issue1328883.pdf?followup=7ca7e0da48c26892bcb4ee80ce8061de
for date, _id in WORONI_IDS:
    # "prep" request: its response body is used as the followup token and its
    # cookies are sent with the subsequent requests.
    prep_url = "{}{}/prep?_={}".format(PDF_BASE, _id, int(time.time()))
    print(prep_url)
    prep = requests.get(prep_url)
    cookies = prep.cookies
    followup = prep.text
    print(followup)

    # Poll the "ping" URL until it answers 200 (presumably meaning the
    # rendition is ready -- confirm). Pause between attempts so the server is
    # not hammered in a tight loop.
    while True:
        ping_url = "{}{}.ping?followup={}&_={}".format(PDF_BASE, _id, followup, int(time.time()))
        ping = requests.get(ping_url, cookies=cookies)
        if ping.status_code == 200:
            break
        time.sleep(1)

    pdf_url = "{}{}.pdf?followup={}".format(PDF_BASE, _id, followup)
    pdf = requests.get(pdf_url, cookies=cookies, stream=True)
    try:
        if not pdf.ok:
            # Do not save an HTTP error body under a .pdf name.
            print("failed to download {} (HTTP {}), skipping".format(pdf_url, pdf.status_code))
            continue
        # NOTE(review): the file name is derived from the date only, so two
        # issues sharing a date would overwrite each other.
        with open("{}.pdf".format(date), 'wb') as fd:
            for chunk in pdf.iter_content(1024):
                fd.write(chunk)
    finally:
        # A streamed response holds its connection until explicitly closed.
        pdf.close()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment