@abehmiel · Last active August 9, 2017
Script to download all of the CBS press releases. Prints verbose output to the console and saves each release to a text file.
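The script targets Python 3 and assumes the requests, beautifulsoup4, and lxml packages are installed (for example, via pip install requests beautifulsoup4 lxml).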
from bs4 import BeautifulSoup
from os import makedirs
from os.path import join
from urllib.parse import urljoin
import requests

CBS_HOME_URL = 'https://www.cbscorporation.com/'
# Number of archive index pages for each year
YEARS = {'2009': 7, '2010': 4, '2011': 11, '2012': 9,
         '2013': 9, '2014': 9, '2015': 4, '2016': 6, '2017': 5}
CAT_Q = '?cat=3'

makedirs('press-releases', exist_ok=True)

# Gather up all the index pages for each year
for year, num_pages in YEARS.items():
    for page in range(1, num_pages + 1):
        this_url = CBS_HOME_URL + year + '/page/' + str(page) + '/' + CAT_Q
        print(this_url)
        this_page = requests.get(this_url)
        soup = BeautifulSoup(this_page.content, 'lxml')
        # Each press release on the index page is headlined by an <h1> link
        for i, hed in enumerate(soup.find_all('h1'), start=1):
            href = hed.find('a').attrs['href']
            # Resolve relative links against the site root
            landed_url = urljoin(CBS_HOME_URL, href)
            print("Downloading from...", landed_url)
            pr_page = requests.get(landed_url)
            pr_soup = BeautifulSoup(pr_page.content, 'lxml')
            pr_text = pr_soup.find(class_='entry-content').text
            print(pr_text)
            with open(join('press-releases',
                           '{}-{}-{}.txt'.format(year, page, i)), 'w') as text_file:
                text_file.write(pr_text)
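If the site rate-limits or a request intermittently fails, the bare requests.get calls above will raise (or quietly fetch an error page). Below is a minimal hardening sketch, not part of the original gist; the fetch helper, its retry count, timeout, and one-second delay are all assumptions.

import time
import requests

def fetch(url, retries=3, delay=1.0):
    """GET a URL, retrying on failure and pausing between attempts."""
    for attempt in range(retries):
        try:
            resp = requests.get(url, timeout=30)
            resp.raise_for_status()  # raise on 4xx/5xx responses
            return resp
        except requests.RequestException:
            if attempt == retries - 1:
                raise  # out of retries; surface the error
            time.sleep(delay)  # brief pause before retrying

Swapping fetch(...) in for the requests.get(...) calls would also throttle the scraper between attempts, which is friendlier to the server.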