# tsujamin/scrape-v2.py

## scrape-v2.py
import requests
import time

# Two-stage scraper:
#   1. Ask the Trove API for every issue of newspaper title 666 in the
#      configured date range and collect (date, issue id) pairs in WORONI_IDS.
#   2. For each issue, drive the rendition endpoints (prep -> ping -> pdf)
#      and save the result as "<date>.pdf" in the current directory.
# Everything runs at module level, exactly like the original script.

WORONI_IDS = list()   # (issue date, integer issue id) tuples, filled in by stage 1
WORONI_PDFS = list()  # never populated by this script; kept for compatibility

BASE_SERVER = "http://api.trove.nla.gov.au"
ISSUE_SEARCH = "/newspaper/title/666?encoding=json&include=years&include=years&range=19500614-20071101"
# NOTE(review): the API key is hard-coded in source; consider loading it from
# the environment or a local config file instead of committing it.
KEY_PARAM = "&key=21cu4u3jraqik0uq"

RATE_LIMIT_DELAY = 15  # seconds to back off after an HTTP 403 from the API
POLL_DELAY = 1         # seconds between retries / rendition status polls

print("scraping article IDs")

url = BASE_SERVER + ISSUE_SEARCH + KEY_PARAM
while True:
    print(url)
    resp = requests.get(url)

    if resp.status_code == 403:
        print("rate limited. sleeping 15s and retrying...")
        time.sleep(RATE_LIMIT_DELAY)
        # Retry with a fresh request; the original fell through here and went
        # on to parse the body of the 403 response.
        continue

    try:
        years = resp.json()["newspaper"]["year"]
    except ValueError:
        # The body was not valid JSON. Pause briefly so a persistent failure
        # does not turn into a tight request loop against the server.
        print("failed to request... retrying")
        time.sleep(POLL_DELAY)
        continue

    for year in years:
        for issue in year["issue"]:
            WORONI_IDS.append((issue["date"], int(issue["id"])))
            print(issue["id"])

    break

print("{} issues...".format(len(WORONI_IDS)))

# One line per issue, written as the tuple's repr, e.g. "('<date>', <id>)".
with open("./ids.txt", "w") as f:
    f.writelines("{}\n".format(entry) for entry in WORONI_IDS)

PDF_BASE = "http://trove.nla.gov.au/newspaper/rendition/nla.news-issue"
# Example request sequence for a single issue:
#http://trove.nla.gov.au/newspaper/rendition/nla.news-issue1328883/prep?_=1472298408713
#http://trove.nla.gov.au/newspaper/rendition/nla.news-issue1328883.ping?followup=7ca7e0da48c26892bcb4ee80ce8061de&_=1472298408714
#http://trove.nla.gov.au/newspaper/rendition/nla.news-issue1328883.pdf?followup=7ca7e0da48c26892bcb4ee80ce8061de

for date, _id in WORONI_IDS:
    # Step 1: "prep" request. Its response body is used as the followup token
    # and its cookies are reused for the later requests.
    # NOTE(review): the "_" parameter looks like a cache-buster; the example
    # URLs above carry millisecond timestamps while this sends seconds --
    # confirm the server does not care.
    prep_url = "{}{}/prep?_={}".format(PDF_BASE, _id, int(time.time()))
    print(prep_url)
    prep = requests.get(prep_url)
    if not prep.ok:
        # Without a valid token the ping loop below could never see a 200
        # and would spin forever, so skip this issue instead.
        print("prep failed for issue {} (HTTP {}); skipping".format(_id, prep.status_code))
        continue

    cookies = prep.cookies
    followup = prep.text
    print(followup)

    # Step 2: poll the "ping" endpoint until it answers 200, pausing between
    # polls rather than hammering the server in a busy loop.
    while True:
        ping_url = "{}{}.ping?followup={}&_={}".format(PDF_BASE, _id, followup, int(time.time()))
        ping = requests.get(ping_url, cookies=cookies)
        if ping.status_code == 200:
            break
        time.sleep(POLL_DELAY)

    # Step 3: stream the PDF to disk in 1 KiB chunks.
    pdf_url = "{}{}.pdf?followup={}".format(PDF_BASE, _id, followup)
    pdf = requests.get(pdf_url, cookies=cookies, stream=True)
    if not pdf.ok:
        # Do not save an HTTP error page under a .pdf name.
        print("pdf download failed for issue {} (HTTP {}); skipping".format(_id, pdf.status_code))
        continue

    with open("{}.pdf".format(date), 'wb') as fd:
        for chunk in pdf.iter_content(1024):
            fd.write(chunk)