Instantly share code, notes, and snippets.

Embed
What would you like to do?
Script to download all the CBS press releases. Prints verbose output to console and to file.
from bs4 import BeautifulSoup
from glob import glob
from os import makedirs
from os.path import join
from urllib.parse import urljoin
import requests
# Site root; every yearly index URL is built by appending to this.
CBS_HOME_URL = 'https://www.cbscorporation.com/'

# Archive year -> number of index pages to walk for that year.
# The counts are kept as strings; the crawl loop converts them with int().
YEARS = {
    '2009': '7',
    '2010': '4',
    '2011': '11',
    '2012': '9',
    '2013': '9',
    '2014': '9',
    '2015': '4',
    '2016': '6',
    '2017': '5',
}

# Query string appended to every index-page URL.
# NOTE(review): presumably category 3 is the press-release category -- confirm.
CAT_Q = '?cat=3'

# Output directory for the downloaded text files; exist_ok lets the script
# be re-run without failing when the directory is already there.
makedirs('press-releases', exist_ok=True)
# Walk every index page of every year, follow each headline link, and save
# the text of each press release to press-releases/<year>-<page>-<n>.txt.

# Seconds to wait on any single HTTP request; without a timeout one stalled
# connection would hang the whole crawl.
_REQUEST_TIMEOUT = 30


def _fetch(url):
    """Return the response for *url*, or None if the request failed.

    Failures (connection errors, timeouts, HTTP status >= 400) are reported
    on the console so the crawl can skip the page and keep going.
    """
    try:
        response = requests.get(url, timeout=_REQUEST_TIMEOUT)
    except requests.RequestException as err:
        print("Request failed:", url, err)
        return None
    if not response.ok:
        print("Request failed:", url, "HTTP", response.status_code)
        return None
    return response


for year in YEARS:
    for page in range(1, int(YEARS[year]) + 1):
        this_url = CBS_HOME_URL + str(year) + '/page/' + str(page) + '/' + CAT_Q
        print(this_url)
        this_page = _fetch(this_url)
        if this_page is None:
            continue
        soup = BeautifulSoup(this_page.content, 'lxml')

        # Number headlines from 1 in document order; the number is part of
        # the output file name, so skipped headlines still consume a number.
        for i, hed in enumerate(soup.find_all('h1'), start=1):
            link = hed.find('a')
            if link is None or not link.get('href'):
                # An <h1> without a link has nothing to download.
                continue

            # Resolve against the index page so relative hrefs work too;
            # absolute hrefs pass through unchanged.
            landed_url = urljoin(this_url, link['href'])
            print("Downloading from...", landed_url)
            pr_page = _fetch(landed_url)
            if pr_page is None:
                continue

            pr_soup = BeautifulSoup(pr_page.content, 'lxml')
            entry = pr_soup.find(class_='entry-content')
            if entry is None:
                print("No entry-content found, skipping:", landed_url)
                continue
            pr_text = entry.text
            print(pr_text)

            out_name = str(year) + '-' + str(page) + '-' + str(i) + '.txt'
            # Explicit UTF-8 so non-ASCII characters in a release do not
            # depend on the platform's default encoding; `with` closes the
            # file even if the write fails.
            with open(join('press-releases', out_name), 'w',
                      encoding='utf-8') as text_file:
                text_file.write(pr_text)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment