@abehmiel
Last active August 9, 2017 14:47
Script to download all Fox News press releases from the press archive. Prints verbose output to the console and writes each release to its own text file.
from bs4 import BeautifulSoup
from os import makedirs
from urllib.parse import urljoin
import requests

FOX_HOME_URL = 'http://press.foxnews.com/press-archive/'
YEARS = ['2011', '2012', '2013', '2014', '2015', '2016', '2017']
MONTHS = ['january', 'february', 'march', 'april', 'may', 'june',
          'july', 'august', 'september', 'october', 'november', 'december']

makedirs('press-releases', exist_ok=True)

# Walk the monthly archive index pages, e.g. .../2011/january-2011/
for year in YEARS:
    for month in MONTHS:
        this_url = FOX_HOME_URL + year + '/' + month + '-' + year + '/'
        print(this_url)
        this_page = requests.get(this_url)
        soup = BeautifulSoup(this_page.content, 'lxml')
        i = 0
        # Each press-release link on an index page sits inside an <h3>
        for hed in soup.find_all('h3'):
            try:
                href = hed.find('a').attrs['href']
                # Resolve relative links against the index page URL
                landed_url = urljoin(this_url, href)
                print("Downloading from...", landed_url)
                pr_page = requests.get(landed_url)
                pr_soup = BeautifulSoup(pr_page.content, 'lxml')
                pr_text = pr_soup.find(class_='hentry').text
                print(pr_text)
                i += 1
                # Save each release as press-releases/<year>-<month>-<n>.txt
                out_path = 'press-releases/' + year + '-' + month + '-' + str(i) + '.txt'
                with open(out_path, 'w') as text_file:
                    text_file.write(pr_text)
            except (AttributeError, KeyError, requests.RequestException) as err:
                # Skip headings without a link and pages missing the
                # .hentry body, but report the failure instead of
                # silently swallowing it
                print("Skipping:", err)
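A quick sanity check after a run, as a minimal sketch: it assumes the script above completed and wrote its output into press-releases/, and simply counts the saved files per year.

from glob import glob

# Tally the files written by the scraper, grouped by year
# (relies on the <year>-<month>-<n>.txt naming used above)
for year in range(2011, 2018):
    files = glob('press-releases/' + str(year) + '-*.txt')
    print(year, len(files))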