Scrape a single issue from Publishers Weekly's archive
import os
#pip dependencies
import requests
from PyPDF2 import PdfFileMerger
#-----------
#Change these values to...
SCRAPE_YEAR = 1892 #Select the year you wish to scrape. 1923 and earlier is public domain.
SCRAPE_MONTH = 1
SCRAPE_DAY = 30
MAX_PAGES = 99999 #After it ingests this many pages, it will output a "--PARTIAL" file and move on. The average is ~24. The max I've seen is 156, which seemed like a book list. Just set to 1000+ to ignore this.
#------------
def make_url(year, month, day, page_number):
    year = str(year)
    month = str(month).zfill(2)
    day = str(day).zfill(2)
    return 'https://archive.publishersweekly.com/?a=is&oid=BG{}{}{}.1.{}&type=pagepdf'.format(year, month, day, page_number)
def download(year, month, day):
    page_number = 1
    merger = PdfFileMerger()
    iso_date = f'{year}-{month:02d}-{day:02d}'
    #Exit early if issue already downloaded
    if os.path.isfile(f'{iso_date}.pdf'):
        print(iso_date, 'ISSUE ALREADY DOWNLOADED')
        return
    print(iso_date, 'BEGIN')
    while True:
        filename = f'{iso_date}--{page_number}.pdf'
        #Move to next page if page already downloaded
        if os.path.isfile(filename):
            print(iso_date, page_number, 'PAGE ALREADY DOWNLOADED')
            merger.append(filename) #still include the previously downloaded page in the merged output
            page_number += 1
            continue
        url = make_url(year, month, day, page_number)
        r = requests.get(url)
        if 'Invalid value' in r.text: #page does not exist, so merge what we have into one pdf
            print(f'{iso_date} MERGING...')
            break
        if page_number > MAX_PAGES:
            print(f'{iso_date} PARTIAL MERGING...')
            break
        with open(filename, 'wb') as f:
            f.write(r.content) #output page just in case
        merger.append(filename) #add page to merger
        print(iso_date, page_number)
        page_number += 1
    #Output
    if page_number <= MAX_PAGES:
        merger.write(f'{iso_date}.pdf') #entire pdf
        print(f'{iso_date} COMPLETE.')
    else:
        merger.write(f'{iso_date}--PARTIAL.pdf') #pdf was too long. output partial
        print(f'{iso_date} PARTIAL COMPLETE.')
    merger.close()
    #Delete temp files
    for i in range(1, page_number):
        os.remove(f'{iso_date}--{i}.pdf')
    print(f'{iso_date} DUMPED INDIVIDUAL PAGE PDFS')
def main():
    print('INDIVIDUAL ISSUE SCRAPE')
    download(SCRAPE_YEAR, SCRAPE_MONTH, SCRAPE_DAY)
    print('CONGRATULATIONS.')

main()
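The script downloads exactly one issue per run, set by the SCRAPE_* constants. If you want a run of issues, a small wrapper around download() is enough. The sketch below is not part of the original gist: it assumes a weekly publication schedule and simply steps through dates seven days at a time; dates with no issue will likely just yield an empty merged PDF, so feed it real publication dates.

#Sketch (not in the original gist): scrape several issues by calling
#download() once per date. Assumes a weekly publication schedule.
from datetime import date, timedelta

def download_weekly_run(start, end):
    current = start
    while current <= end:
        download(current.year, current.month, current.day)
        current += timedelta(days=7)

#Example (dates are an assumption): a year's worth of weekly issues.
#download_weekly_run(date(1892, 1, 2), date(1892, 12, 31))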