# coblezc/news-locations.py

## news-locations.py
from __future__ import unicode_literals
from bs4 import BeautifulSoup
import urllib
from selenium import webdriver
import time
import spacy
from twython import Twython
import json

# "loadin' the pipeline"
nlp = spacy.load('en')

# result stores filled in by get_page():
#   scrape    -> {headline: [every GPE mention, in reading order]}
#   locations -> {headline: 'comma-separated unique mentions'}
scrape, locations = {}, {}

# read the Twitter API credentials from cred.json and build the client
with open('cred.json', 'r') as cred_fh:
	auth = json.load(cred_fh)
twitter = Twython(
	auth['APP_KEY'],
	auth['APP_SECRET'],
	auth['ACCESS_TOKEN'],
	auth['ACCESS_TOKEN_SECRET'],
)

# get URLs of articles on NYTimes front page
start_url = 'https://www.nytimes.com'
# urlopen returns a file-like object; close it explicitly once the body has
# been read instead of leaving the connection open.
response = urllib.urlopen(start_url)
try:
	html = response.read()
finally:
	response.close()
soup = BeautifulSoup(html, 'html.parser')
titles = soup.select('h2.story-heading')
hrefs = []
# NOTE: the [3:4] slice restricts the run to the fourth story heading only.
for title in titles[3:4]:
	anchors = title.select('a')
	if not anchors:
		# heading without a link: nothing to visit
		continue
	href = anchors[0].get('href')
	if not href:
		# anchor without an href attribute (.get() returned None) or empty href
		continue
	hrefs.append(href.encode('utf8'))

# get text of each article: one shared Chrome session drives every page load
chromedriver_path = "/Users/coblezc/Downloads/chromedriver"
driver = webdriver.Chrome(chromedriver_path)
def get_page(max_chars=139):
	"""Scrape the article loaded in ``driver`` and toot the places it names.

	Waits two seconds, reads the page's ``h1`` headline and every
	``p.story-body-text`` paragraph from the module-level ``driver``, runs
	each paragraph through ``nlp`` and collects the text of all entities
	labelled ``GPE``.  Every mention is stored in ``scrape[headline]``; the
	de-duplicated, comma-joined mentions are stored in
	``locations[headline]``.  The status ``"<headline>: <places>"``,
	truncated to ``max_chars`` characters, is printed and posted through
	``twitter``.

	Only the article scraped by this call is posted; entries left in
	``locations`` by earlier calls are not repeated.  When the article
	yields no ``GPE`` entity, nothing is added to ``locations`` and nothing
	is posted.

	Args:
		max_chars: Maximum length of the posted status.  Defaults to 139,
			the truncation length this script has always used.

	Returns:
		None.
	"""
	# timer delay between scrapes
	time.sleep(2)

	# Keep the headline as text rather than utf-8 bytes: formatting encoded
	# bytes into the unicode template below fails on non-ASCII headlines.
	headline = driver.find_element_by_tag_name('h1').text

	# dictionary data model: {headline: [place1, place2, place3, etc], ...}
	mentions = []
	scrape[headline] = mentions
	for para in driver.find_elements_by_css_selector('p.story-body-text'):
		# spacy to extract locations
		doc = nlp(para.text)
		mentions.extend(ent.text for ent in doc.ents if ent.label_ == 'GPE')

	# dedupe locations, keeping first-seen order
	seen = set()
	unique_places = []
	for place in mentions:
		if place not in seen:
			seen.add(place)
			unique_places.append(place)

	if not unique_places:
		# no place found: an empty status would be pointless to post
		return

	locations[headline] = ', '.join(unique_places)

	# build the status for this article and cap its length
	toot = '{}: {}'.format(headline, locations[headline])[:max_chars]
	print(toot)
	# toot the toot
	twitter.update_status(status=toot)


# Load each collected article URL in the shared browser, then scrape it and
# toot the places it mentions.
for href in hrefs:
	driver.get(href)
	get_page()