Last active
August 9, 2017 14:47
-
-
Save abehmiel/dced5101c33cb5d4cde819b3df0ba8fb to your computer and use it in GitHub Desktop.
Script to download all Fox News press releases. Prints verbose output to the console and saves each release to a file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""Download all Fox News press-archive releases (2011-2017).

Walks every monthly index page, follows each headline link, extracts the
release body (the element with class 'hentry'), prints it, and saves it to
press-releases/<year>-<month>-<n>.txt. Verbose progress goes to stdout.
"""
from bs4 import BeautifulSoup
from glob import glob
from os import makedirs
from os.path import join
from urllib.parse import urljoin

import requests

FOX_HOME_URL = 'http://press.foxnews.com/press-archive/'
YEARS = ['2011', '2012', '2013', '2014', '2015', '2016', '2017']
MONTHS = ['january', 'february', 'march', 'april', 'may', 'june',
          'july', 'august', 'september', 'october', 'november', 'december']

makedirs('press-releases', exist_ok=True)

# Visit each monthly index page, e.g. .../press-archive/2011/january-2011/
for year in YEARS:
    for month in MONTHS:
        index_url = f'{FOX_HOME_URL}{year}/{month}-{year}/'
        print(index_url)
        try:
            index_page = requests.get(index_url, timeout=30)
            index_page.raise_for_status()
        except requests.RequestException as exc:
            # A missing month (e.g. future dates in 2017) is expected; report and move on.
            print('Skipping index:', index_url, '-', exc)
            continue
        soup = BeautifulSoup(index_page.content, 'lxml')

        release_count = 0  # numbers the saved files within each month
        for hed in soup.find_all('h3'):
            link = hed.find('a')
            if link is None or 'href' not in link.attrs:
                continue  # headline without a link — nothing to download
            # BUG FIX: the original joined against '' which mangles relative
            # hrefs; join against the index page URL instead.
            landed_url = urljoin(index_url, link.attrs['href'])
            print("Downloading from...", landed_url)
            try:
                pr_page = requests.get(landed_url, timeout=30)
                pr_page.raise_for_status()
            except requests.RequestException as exc:
                print('Skipping release:', landed_url, '-', exc)
                continue
            pr_soup = BeautifulSoup(pr_page.content, 'lxml')
            entry = pr_soup.find(class_='hentry')
            if entry is None:
                print('No hentry content at', landed_url)
                continue
            pr_text = entry.text
            print(pr_text)
            release_count += 1
            # Context manager guarantees the file is closed even if write fails.
            out_path = f'press-releases/{year}-{month}-{release_count}.txt'
            with open(out_path, 'w') as out_file:
                out_file.write(pr_text)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment