Instantly share code, notes, and snippets.

Embed
What would you like to do?
Script to download all Fox News press releases. Prints verbose output to the console and saves each press release to a text file.
from bs4 import BeautifulSoup
from glob import glob
from os import makedirs
from os.path import join
from urllib.parse import urljoin
import requests
# Root of the press archive; monthly index pages live underneath it.
FOX_HOME_URL = 'http://press.foxnews.com/press-archive/'
YEARS = ['2011', '2012', '2013', '2014', '2015', '2016', '2017']
MONTHS = ['january', 'february', 'march', 'april', 'may', 'june',
          'july', 'august', 'september', 'october', 'november', 'december']
# Directory that receives one text file per downloaded press release.
OUTPUT_DIR = 'press-releases'
# Seconds to wait on a single HTTP request, so one stalled connection
# cannot hang the whole crawl.
REQUEST_TIMEOUT = 30

makedirs(OUTPUT_DIR, exist_ok=True)

# Example of the path appended to FOX_HOME_URL for one monthly index page.
page_pattern = '2011/january-2011/'

# Walk every monthly index page, follow each headline link, and save the
# text of the linked press release as
# press-releases/<year>-<month>-<n>.txt.
for year in YEARS:
    for month in MONTHS:
        this_url = '{}{}/{}-{}/'.format(FOX_HOME_URL, year, month, year)
        print(this_url)
        try:
            this_page = requests.get(this_url, timeout=REQUEST_TIMEOUT)
        except requests.RequestException as err:
            print("Skipping index page", this_url, "-", err)
            continue
        if not this_page.ok:
            # An error page is not an index; do not scrape its headings.
            print("Skipping index page", this_url,
                  "- HTTP status", this_page.status_code)
            continue
        soup = BeautifulSoup(this_page.content, 'lxml')

        # Per-month counter of saved releases; used in the output file name.
        i = 0
        for hed in soup.find_all('h3'):
            link = hed.find('a')
            href = link.get('href') if link is not None else None
            if not href:
                # Heading without a usable link: nothing to download.
                continue
            # Resolve against the index page so relative links work too.
            landed_url = urljoin(this_url, href)
            print("Downloading from...", landed_url)
            try:
                pr_page = requests.get(landed_url, timeout=REQUEST_TIMEOUT)
                pr_soup = BeautifulSoup(pr_page.content, 'lxml')
                entry = pr_soup.find(class_='hentry')
                if entry is None:
                    print("No press release body found at", landed_url)
                    continue
                pr_text = entry.text
                print(pr_text)
                i += 1
                out_path = join(
                    OUTPUT_DIR, '{}-{}-{}.txt'.format(year, month, i))
                # Explicit UTF-8 so non-ASCII punctuation in the release
                # cannot fail under a platform-default codec.
                with open(out_path, 'w', encoding='utf-8') as text_file:
                    text_file.write(pr_text)
            except (requests.RequestException, OSError, UnicodeError) as err:
                # Best effort: report the failure and move on to the next
                # release instead of silently swallowing every exception.
                print("Failed to save", landed_url, "-", err)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment