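"""Download today's front-page images and PDFs from the Newseum CDN.

Usage: python this_script.py OUTPUT_DIR

Scrapes the "Today's Front Pages" index for tfp_id links, then fetches
each paper's large JPG and PDF for today's date into OUTPUT_DIR.
"""
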
import datetime
import os.path
import re
import sys

import requests
from bs4 import BeautifulSoup, SoupStrainer

# Output directory for the downloaded files.
out = sys.argv[1]

def basename(x):
    """Return the final path component of a URL."""
    xs = x.split('/')
    return xs[-1]
def slurp(href):
    """Fetch a URL and write its body into the output directory."""
    resp = requests.get(href)
    with open(os.path.join(out, basename(href)), 'wb') as fp:
        fp.write(resp.content)
# CDN URL templates: the %d slot is the day of the month, the %s slot
# is the paper's tfp_id (an uppercase identifier matched below).
PDF_BASE = 'http://cdn.newseum.org/dfp/pdf%d/%s.pdf'
IMG_BASE = 'http://cdn.newseum.org/dfp/jpg%d/lg/%s.jpg'
DOM = datetime.datetime.now().day  # day of month selects today's CDN folder
def grab(base):
    """Download both the large JPG and the PDF for one front page."""
    img = IMG_BASE % (DOM, base)
    pdf = PDF_BASE % (DOM, base)
    slurp(img)
    slurp(pdf)
root = 'http://newseum.org'  # only used by the commented-out fallback below

# Index page listing every available front page. (Renamed from `all`,
# which shadowed the builtin.)
index_page = requests.get('http://www.newseum.org/todaysfrontpages/?tfp_show=all')

# We only care about <a> tags, so restrict parsing to them.
only_a_tags = SoupStrainer("a")
seen = set()
find_page_name = re.compile('tfp_id=([A-Z_]+)')
def process_page(href):
    """Extract the tfp_id from an index link and download its files once."""
    m = find_page_name.search(href)
    if m:
        name = m.group(1)
        if name not in seen:
            print('Processing %s' % name)
            grab(name)
            seen.add(name)
            # Fallback: scrape the paper's own page for the JPG/PDF links
            # instead of hitting the CDN URL templates directly.
            # print('fetching %s' % href)
            # page = requests.get('%s/%s' % (root, href))
            # soup = BeautifulSoup(page.content, 'html.parser')
            # for img in soup.find_all('img'):
            #     if img.get('src').endswith('%s.jpg' % name):
            #         slurp(img.get('src'))
            # for a in soup.find_all('a'):
            #     href = a.get('href')
            #     if href and href.endswith('%s.pdf' % name):
            #         slurp(href)
soup = BeautifulSoup(index_page.content, 'html.parser', parse_only=only_a_tags)
for a in soup.find_all('a'):
    href = a.get('href')
    # a.get('href') can be None, so guard before the substring test.
    if href and 'tfp_show=all&tfp_id' in href:
        process_page(href)