Last active
April 27, 2020 05:08
-
-
Save lethargilistic/e0d79b822530edeada5a1e196fd0cf1d to your computer and use it in GitHub Desktop.
Scraper for a year of the Publishers Weekly archive. Outputs each issue for one year as PDFs. Change global variables to configure.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os | |
from calendar import weekday, monthrange, SATURDAY | |
#dependencies from pip | |
import requests | |
from PyPDF2 import PdfFileMerger | |
#-----------
#Change these values to configure the scraper.
SCRAPE_YEAR = 1892 #The year to scrape; every Saturday of this year is tried as an issue date. Author's note: 1923 and before is public domain.
MAX_PAGES = 99999 #Page cap per issue. Set to 1000+ to effectively disable it. Once an issue goes past this many pages, download() writes a "--PARTIAL" file and moves on. Author's note: the average is ~24 pages; the max seen is 156, which seemed like a book list.
#------------
def make_url(year, month, day, page_number):
    """Build the archive URL that serves one page of one issue as a PDF.

    The issue is identified by ``BG`` followed by the year and the
    zero-padded (two-character) month and day; ``page_number`` selects
    the page within that issue.
    """
    template = 'https://archive.publishersweekly.com/?a=is&oid=BG{}{}{}.1.{}&type=pagepdf'
    # Year is used as-is; month and day are left-padded with zeros to width 2.
    date_parts = (str(year), str(month).zfill(2), str(day).zfill(2))
    return template.format(*date_parts, page_number)
def download(year, month, day, timeout=60):
    """Download one issue page by page and merge the pages into a single PDF.

    Pages are fetched with increasing page numbers until the server answers
    with a body containing 'Invalid value' (page does not exist) or until
    more than MAX_PAGES pages have been seen. Each page is stored as
    ``<iso_date>--<n>.pdf`` in the current directory, merged into
    ``<iso_date>.pdf`` (or ``<iso_date>--PARTIAL.pdf`` when the page cap
    was hit), and the per-page files are then deleted.

    The function is resumable: an already merged issue is skipped, and page
    files left on disk by an interrupted run are reused instead of being
    downloaded again.

    Args:
        year, month, day: Integers identifying the issue date.
        timeout: Seconds to wait for the server on each request before
            ``requests`` raises a timeout error.

    Raises:
        requests.RequestException: On a timeout, connection failure, or an
            HTTP error status for a page that is not the 'Invalid value'
            end marker.
    """
    iso_date = f'{year}-{month:02d}-{day:02d}'

    #Exit early if Issue already downloaded
    if os.path.isfile(f'{iso_date}.pdf'):
        print(iso_date, 'ISSUE ALREADY DOWNLOADED')
        return

    print(iso_date, 'BEGIN')
    page_number = 1
    partial = False #True only when the MAX_PAGES cap actually cut the issue short
    merger = PdfFileMerger()
    try:
        while True:
            filename = f'{iso_date}--{page_number}.pdf'

            #Reuse a page left over from an earlier run. It still has to go
            #into the merger, otherwise it would be missing from the output.
            if os.path.isfile(filename):
                print(iso_date, page_number, 'PAGE ALREADY DOWNLOADED')
                merger.append(filename)
                page_number += 1
                continue

            url = make_url(year, month, day, page_number)
            with requests.get(url, timeout=timeout) as r:
                if 'Invalid value' in r.text: #page does not exist, output to one pdf
                    if page_number == 1:
                        print(f'{iso_date} NO ISSUE FOR THIS DATE\nNEXT\n\n')
                        return
                    print(f'{iso_date} MERGING...')
                    break

                #Checked after the end marker so an issue with exactly
                #MAX_PAGES pages is still reported as complete.
                if page_number > MAX_PAGES:
                    partial = True
                    print(f'{iso_date} PARTIAL MERGING...')
                    break

                #Do not save an HTTP error body to disk as if it were a PDF page.
                r.raise_for_status()

                #Write to a scratch name first so an interrupted write never
                #leaves a truncated file that a later run would trust.
                scratch = filename + '.part'
                with open(scratch, 'wb') as f:
                    f.write(r.content) #output individual page PDF
                os.replace(scratch, filename)

            merger.append(filename) #add page to merger
            print(iso_date, page_number)
            page_number += 1

        #Output
        if partial:
            merger.write(f'{iso_date}--PARTIAL.pdf') #pdf was too long. output partial
            print(f'{iso_date} PARTIAL COMPLETE.')
        else:
            merger.write(f'{iso_date}.pdf') #entire pdf
            print(f'{iso_date} COMPLETE.')
    finally:
        #Close the merger on every path, including the early return and
        #errors, and before the page files are deleted below.
        merger.close()

    #Delete temp files
    for i in range(1, page_number):
        os.remove(f'{iso_date}--{i}.pdf')
    print(f'{iso_date} DELETED INDIVIDUAL PAGE PDFS')
    print('NEXT\n\n')
def get_issue_dates(year=None):
    """Return every Saturday of a year as ``(year, month, day)`` tuples.

    Args:
        year: Year to enumerate. Defaults to the module-level SCRAPE_YEAR
            when omitted.

    Returns:
        A list of ``(year, month, day)`` tuples in chronological order.
    """
    if year is None:
        year = SCRAPE_YEAR

    issue_dates = [] #Tuples of (year, month, day)
    for month in range(1, 13):
        # monthrange() returns (weekday of the 1st, number of days in month).
        # It is NOT a (start, stop) pair for range(); treating it as one
        # skips the first days of any month that does not begin on a Monday.
        first_weekday, days_in_month = monthrange(year, month)
        first_saturday = 1 + (SATURDAY - first_weekday) % 7
        issue_dates.extend(
            (year, month, day)
            for day in range(first_saturday, days_in_month + 1, 7)
        )
    return issue_dates
def main():
    """Download every issue of SCRAPE_YEAR, one candidate date at a time."""
    issues = get_issue_dates()
    print(f'DOWNLOADING: {issues}')
    for issue in issues:
        download(*issue)
    print(f'FINISHED {SCRAPE_YEAR}')
    print('CONGRATULATIONS.')


# Only start scraping when run as a script, so importing this module
# (e.g. to reuse make_url) does not trigger a full network download.
if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment