# Script to download all the CBS press releases. Prints verbose output to console and to file.
from bs4 import BeautifulSoup
from glob import glob  # NOTE(review): unused here; kept to preserve the import surface
from os import makedirs
from os.path import join  # NOTE(review): unused here; kept to preserve the import surface
from urllib.parse import urljoin
import requests

CBS_HOME_URL = 'https://www.cbscorporation.com/'
# Year -> number of paginated index pages of press releases for that year.
YEARS = {'2009': '7', '2010': '4', '2011': '11', '2012': '9',
         '2013': '9', '2014': '9', '2015': '4', '2016': '6', '2017': '5'}
CAT_Q = '?cat=3'  # query string selecting the press-release category

makedirs('press-releases', exist_ok=True)

# Gather up all the index pages for each year, then fetch every press
# release linked from an <h1> headline on each index page.
for year in YEARS:
    for page in range(1, int(YEARS[year]) + 1):
        this_url = CBS_HOME_URL + year + '/page/' + str(page) + '/' + CAT_Q
        print(this_url)
        this_page = requests.get(this_url)
        soup = BeautifulSoup(this_page.content, 'lxml')
        # enumerate replaces the original manual i = 0 / i += 1 counter.
        for i, hed in enumerate(soup.find_all('h1'), start=1):
            href = hed.find('a').attrs['href']
            # BUG FIX: the original called urljoin('', href); with an empty
            # base, a relative href comes back unresolved and the request
            # fails. Resolve against the site root instead (absolute hrefs
            # are passed through unchanged).
            landed_url = urljoin(CBS_HOME_URL, href)
            print("Downloading from...", landed_url)
            pr_page = requests.get(landed_url)
            pr_soup = BeautifulSoup(pr_page.content, 'lxml')
            pr_text = pr_soup.find(class_='entry-content').text
            print(pr_text)
            # Context manager guarantees the handle is closed even if a
            # later iteration raises; explicit encoding avoids a
            # platform-dependent default.
            out_path = 'press-releases/' + year + '-' + str(page) + '-' + str(i) + '.txt'
            with open(out_path, 'w', encoding='utf-8') as text_file:
                text_file.write(pr_text)