# -*- coding: utf-8 -*-
# Sergei Bugrov
# 7-10-17
#
# Downloads all available articles from https://www.nytimes.com
#
# usage : python nytimes.py
#
# python version : 3.6.1
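#
# Third-party dependencies (assumed to be installed with pip; the original gist
# does not pin versions): requests, beautifulsoup4 (imported as bs4), and lxml,
# which BeautifulSoup uses as its HTML parser below.
#
#     pip install requests beautifulsoup4 lxml
#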
import requests, bs4, os, errno, time, datetime, re


def download_page(url):
    """Fetch a URL and return the requests Response, or None if the request fails
    (sleeps two minutes after a connection error before giving up on this attempt)."""
    try:
        page = requests.get(url, timeout=10.0)
    except requests.exceptions.Timeout:
        print('Timeout\n')
        return None
    except requests.exceptions.ConnectionError:
        print('ConnectionError\n')
        time.sleep(120)
        return None
    except requests.exceptions.HTTPError:
        print('HTTPError\n')
        return None
    except requests.exceptions.TooManyRedirects:
        print('TooManyRedirects\n')
        return None
    else:
        return page
def main():
    max_attempts = 10
    r_unwanted = re.compile('[\n\t\r]')
    urls_to_articles = []
    # Create the output directory for downloaded articles if it does not exist yet
    if not os.path.exists('articles/'):
        try:
            os.makedirs('articles/')
        except OSError as e:
            if e.errno != errno.EEXIST:
                raise
    # STEP 1. BUILD THE LIST OF URLS TO ARTICLES
    if not os.path.exists('urls_to_articles.txt'):
        for year in range(1987, datetime.datetime.now().year + 1):
            catalog_page_by_years = 'http://spiderbites.nytimes.com/free_%s/index.html' % (year)
            links_to_parts = []
            attempts = 0
            print('Year: ', year)
            with open('logfile.log', 'w') as f:
                f.write('STEP 1. Year: ' + str(year) + '\n')
            catalog_page = download_page(catalog_page_by_years)
            while not (catalog_page or attempts > max_attempts):
                catalog_page = download_page(catalog_page_by_years)
                attempts += 1
            if catalog_page:
                catalog_page = bs4.BeautifulSoup(catalog_page.text, "lxml")
                # For years after 1995 the sitemap links are site-absolute paths;
                # for earlier years they are relative to that year's directory
                if year > 1995:
                    links_to_parts.append(['http://spiderbites.nytimes.com%s' % (el.get('href')) for el in catalog_page.select('body > div > div > div > div > div > div > ul > li > a')])
                else:
                    links_to_parts.append(['http://spiderbites.nytimes.com/free_%s/%s' % (year, el.get('href')) for el in catalog_page.select('body > div > div > div > div > div > div > ul > li > a')])
            links_to_parts = [item for sublist in links_to_parts for item in sublist]
            for link_to_parts in links_to_parts:
                attempts = 0
                parts_page = download_page(link_to_parts)
                while not (parts_page or attempts > max_attempts):
                    parts_page = download_page(link_to_parts)
                    attempts += 1
                if parts_page:
                    parts_page = bs4.BeautifulSoup(parts_page.text, "lxml")
                    urls_to_articles.append([el.get('href') for el in parts_page.select('body > div > div > div > div > ul > li > a')])
        urls_to_articles = [item for sublist in urls_to_articles for item in sublist]
        # Backing up the list of URLs
        with open('urls_to_articles.txt', 'w') as output:
            for u in urls_to_articles:
                output.write('%s\n' % (u.strip()))
    # STEP 2. DOWNLOAD ARTICLES
    # If, at some point, Step 2 is interrupted due to unforeseen circumstances
    # (power outage, loss of internet connection), replace the number assigned to
    # the variable url_num below with the one you will find in logfile.log
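    # For example (the number here is purely hypothetical): if the last line of
    # logfile.log reads "STEP 2. Downloading article #154000 from <url>", set
    # url_num = 154000 to resume from that article.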
    url_num = 0
    # If the URL list was built on a previous run, reload it from the backup file
    if os.path.exists('urls_to_articles.txt') and len(urls_to_articles) == 0:
        with open('urls_to_articles.txt', 'r') as f:
            urls_to_articles = f.read().splitlines()
    print('Number of articles that are about to be downloaded: ', len(urls_to_articles))
    for url in urls_to_articles[url_num:]:
        if len(url) > 34:
            attempts = 0
            if url_num % 1000 == 0:
                print('Downloading article #', url_num, ' from ', url)
            with open('logfile.log', 'w') as f:
                f.write('STEP 2. Downloading article #' + str(url_num) + ' from ' + url + '\n')
            article_page = download_page(url)
            while not (article_page or attempts > max_attempts):
                article_page = download_page(url)
                attempts += 1
            if article_page:
                article_page = bs4.BeautifulSoup(article_page.text, "lxml")
                # Try the "articleHeadline" class first, then fall back to "headline"
                title = [el.getText() for el in article_page.find_all(class_="articleHeadline")]
                if len(title) > 0:
                    title = title[0]
                else:
                    title = [el.getText() for el in article_page.find_all(class_="headline")]
                    if len(title) > 0:
                        title = title[0]
                    else:
                        title = ""
                dateline = [el.getText() for el in article_page.find_all(class_="dateline")]
                if len(dateline) > 0:
                    dateline = dateline[0]
                else:
                    dateline = ""
                byline = [el.getText().strip() for el in article_page.find_all(class_="byline")]
                if len(byline) > 0:
                    byline = ' '.join(byline)
                else:
                    byline = ""
                # The article text lives in "articleBody" on some pages and in
                # "story-body-text" on others; try the former first
                body = [el.getText() for el in article_page.find_all(class_="articleBody")]
                if len(body) > 0:
                    body = '\n'.join(body)
                    body = r_unwanted.sub("", body)
                    body = re.sub(' +', ' ', body)
                    with open('articles/' + str(url_num) + url.split('/')[-1] + '.txt', 'w') as output:
                        output.write('(c) ' + str(datetime.datetime.now().year) + ' The New York Times Company\n')
                        output.write(url + '\n')
                        output.write(title + '\n')
                        output.write(dateline + '\n')
                        output.write(byline + '\n')
                        output.write('\n' + body)
                else:
                    body = [el.getText() for el in article_page.find_all(class_="story-body-text")]
                    if len(body) > 0:
                        body = '\n'.join(body)
                        body = r_unwanted.sub("", body)
                        body = re.sub(' +', ' ', body)
                        with open('articles/' + str(url_num) + url.split('/')[-1] + '.txt', 'w') as output:
                            output.write('(c) ' + str(datetime.datetime.now().year) + ' The New York Times Company\n')
                            output.write(url + '\n')
                            output.write(title + '\n')
                            output.write(dateline + '\n')
                            output.write(byline + '\n')
                            output.write('\n' + body)
        url_num += 1
if __name__ == '__main__':
    # main() is called when nytimes.py is run from the command line
    main()