Script to download all the CBS corporate press releases. Prints verbose output to the console and writes each release to its own text file.
from bs4 import BeautifulSoup
from os import makedirs
from urllib.parse import urljoin

import requests

CBS_HOME_URL = 'https://www.cbscorporation.com/'
# Number of press-release index pages published for each year
YEARS = {'2009': 7, '2010': 4, '2011': 11, '2012': 9,
         '2013': 9, '2014': 9, '2015': 4, '2016': 6, '2017': 5}
CAT_Q = '?cat=3'

makedirs('press-releases', exist_ok=True)

# Walk every index page for every year
for year in YEARS:
    for page in range(1, YEARS[year] + 1):
        this_url = CBS_HOME_URL + year + '/page/' + str(page) + '/' + CAT_Q
        print(this_url)
        this_page = requests.get(this_url)
        soup = BeautifulSoup(this_page.content, 'lxml')
        # Each press-release headline on an index page is an <h1> wrapping a link
        i = 0
        for hed in soup.find_all('h1'):
            i += 1
            link = hed.find('a')
            if link is None:
                continue  # skip any headline that doesn't link anywhere
            # Resolve relative hrefs against the site root
            landed_url = urljoin(CBS_HOME_URL, link.attrs['href'])
            print("Downloading from...", landed_url)
            pr_page = requests.get(landed_url)
            pr_soup = BeautifulSoup(pr_page.content, 'lxml')
            pr_text = pr_soup.find(class_='entry-content').text
            print(pr_text)
            # Save each release as press-releases/<year>-<page>-<n>.txt
            with open('press-releases/' + year + '-' + str(page) + '-' + str(i) + '.txt', 'w') as text_file:
                text_file.write(pr_text)
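The script issues one unthrottled requests.get per index page and per release, so a slow or rate-limiting server will make the bare calls hang or raise. A minimal sketch of a more defensive fetcher that could stand in for those calls, assuming only requests plus the standard library (the fetch name, retry count, delay, and timeout are arbitrary choices, not part of the original gist):

import time

import requests

def fetch(url, retries=3, delay=2.0, timeout=10):
    # Hypothetical helper, not part of the original gist:
    # GET a URL with a timeout, simple retries, and a polite pause.
    for attempt in range(1, retries + 1):
        try:
            resp = requests.get(url, timeout=timeout)
            resp.raise_for_status()
            time.sleep(delay)  # pause between requests to be polite to the server
            return resp
        except requests.RequestException as exc:
            print('Attempt', attempt, 'for', url, 'failed:', exc)
            time.sleep(delay * attempt)  # back off a little longer each retry
    raise RuntimeError('Gave up on ' + url + ' after ' + str(retries) + ' attempts')

With a helper like this in place, requests.get(this_url) and requests.get(landed_url) in the script above would simply become fetch(this_url) and fetch(landed_url).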