Script to download all the CBS corporate press releases. Prints verbose output to the console and writes each release to its own text file.
from bs4 import BeautifulSoup
from os import makedirs
from urllib.parse import urljoin

import requests

CBS_HOME_URL = 'https://www.cbscorporation.com/'
# Number of press-release index pages published for each year
YEARS = {'2009': 7, '2010': 4, '2011': 11, '2012': 9,
         '2013': 9, '2014': 9, '2015': 4, '2016': 6, '2017': 5}
CAT_Q = '?cat=3'

makedirs('press-releases', exist_ok=True)

# Walk every index page for every year
for year in YEARS:
    for page in range(1, YEARS[year] + 1):
        this_url = CBS_HOME_URL + year + '/page/' + str(page) + '/' + CAT_Q
        print(this_url)
        this_page = requests.get(this_url)
        soup = BeautifulSoup(this_page.content, 'lxml')
        # Each press-release headline on an index page is an <h1> wrapping a link
        i = 0
        for hed in soup.find_all('h1'):
            i += 1
            link = hed.find('a')
            if link is None:
                continue  # skip any headline that doesn't link anywhere
            # Resolve relative hrefs against the site root
            landed_url = urljoin(CBS_HOME_URL, link.attrs['href'])
            print("Downloading from...", landed_url)
            pr_page = requests.get(landed_url)
            pr_soup = BeautifulSoup(pr_page.content, 'lxml')
            pr_text = pr_soup.find(class_='entry-content').text
            print(pr_text)
            # Save each release as press-releases/<year>-<page>-<n>.txt
            with open('press-releases/' + year + '-' + str(page) + '-' + str(i) + '.txt', 'w') as text_file:
                text_file.write(pr_text)
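The script issues one unthrottled requests.get per index page and per release, so a slow or rate-limiting server will make the bare calls hang or raise. A minimal sketch of a more defensive fetcher that could stand in for those calls, assuming only requests plus the standard library (the fetch name, retry count, delay, and timeout are arbitrary choices, not part of the original gist):

import time

import requests

def fetch(url, retries=3, delay=2.0, timeout=10):
    # Hypothetical helper, not part of the original gist:
    # GET a URL with a timeout, simple retries, and a polite pause.
    for attempt in range(1, retries + 1):
        try:
            resp = requests.get(url, timeout=timeout)
            resp.raise_for_status()
            time.sleep(delay)  # pause between requests to be polite to the server
            return resp
        except requests.RequestException as exc:
            print('Attempt', attempt, 'for', url, 'failed:', exc)
            time.sleep(delay * attempt)  # back off a little longer each retry
    raise RuntimeError('Gave up on ' + url + ' after ' + str(retries) + ' attempts')

With a helper like this in place, requests.get(this_url) and requests.get(landed_url) in the script above would simply become fetch(this_url) and fetch(landed_url).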