Last active
August 9, 2017 14:47
-
-
Save abehmiel/dced5101c33cb5d4cde819b3df0ba8fb to your computer and use it in GitHub Desktop.
Script to download all Fox News press releases. Prints verbose output to the console and saves each release to a file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""Download all Fox News press-archive releases (2011-2017).

Walks every monthly index page, follows each headline link, extracts the
release body (the element with class 'hentry'), prints it, and saves it to
press-releases/<year>-<month>-<n>.txt. Verbose progress goes to stdout.
"""
from bs4 import BeautifulSoup
from glob import glob
from os import makedirs
from os.path import join
from urllib.parse import urljoin

import requests

FOX_HOME_URL = 'http://press.foxnews.com/press-archive/'
YEARS = ['2011', '2012', '2013', '2014', '2015', '2016', '2017']
MONTHS = ['january', 'february', 'march', 'april', 'may', 'june',
          'july', 'august', 'september', 'october', 'november', 'december']

makedirs('press-releases', exist_ok=True)

# Visit each monthly index page, e.g. .../press-archive/2011/january-2011/
for year in YEARS:
    for month in MONTHS:
        index_url = f'{FOX_HOME_URL}{year}/{month}-{year}/'
        print(index_url)
        try:
            index_page = requests.get(index_url, timeout=30)
            index_page.raise_for_status()
        except requests.RequestException as exc:
            # A missing month (e.g. future dates in 2017) is expected; report and move on.
            print('Skipping index:', index_url, '-', exc)
            continue
        soup = BeautifulSoup(index_page.content, 'lxml')

        release_count = 0  # numbers the saved files within each month
        for hed in soup.find_all('h3'):
            link = hed.find('a')
            if link is None or 'href' not in link.attrs:
                continue  # headline without a link — nothing to download
            # BUG FIX: the original joined against '' which mangles relative
            # hrefs; join against the index page URL instead.
            landed_url = urljoin(index_url, link.attrs['href'])
            print("Downloading from...", landed_url)
            try:
                pr_page = requests.get(landed_url, timeout=30)
                pr_page.raise_for_status()
            except requests.RequestException as exc:
                print('Skipping release:', landed_url, '-', exc)
                continue
            pr_soup = BeautifulSoup(pr_page.content, 'lxml')
            entry = pr_soup.find(class_='hentry')
            if entry is None:
                print('No hentry content at', landed_url)
                continue
            pr_text = entry.text
            print(pr_text)
            release_count += 1
            # Context manager guarantees the file is closed even if write fails.
            out_path = f'press-releases/{year}-{month}-{release_count}.txt'
            with open(out_path, 'w') as out_file:
                out_file.write(pr_text)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment