"""Extract locations mentioned in articles on the NYTimes front page."""
from __future__ import unicode_literals
from bs4 import BeautifulSoup
import urllib
from selenium import webdriver
import time
import spacy
from twython import Twython
import json
# "loadin' the pipeline"
nlp = spacy.load('en')

# Module-level accumulators used by get_page():
#   scrape    -- maps each article headline to a list of extracted strings
#   locations -- maps each headline to one comma-separated string
scrape = {}
locations = {}

# twitter credentials/authorization
# cred.json is parsed as JSON; the keys read from it later in this file
# include APP_KEY, APP_SECRET and ACCESS_TOKEN.
with open('cred.json', 'r') as credfile:
    # The load has to sit inside the `with` body so the file is still open
    # when it is parsed (and so the statement is valid Python at all).
    auth = json.load(credfile)
# Builds the Twython client from the credentials loaded into `auth`.
# NOTE(review): this statement is cut off mid-expression in the file -- the
# subscript after the last `auth[` and the closing parenthesis are missing,
# so the module does not parse as written. Presumably the missing argument
# is the OAuth token secret (Twython's fourth positional parameter); restore
# it with whatever key name cred.json actually uses -- TODO confirm.
twitter = Twython(auth['APP_KEY'], auth['APP_SECRET'], auth['ACCESS_TOKEN'], auth[
# get URLs of articles on NYTimes front page
# NOTE(review): the URL literal was empty in the file, which cannot be
# fetched. The address below is inferred from the comment above -- confirm
# it is the page whose markup the selector below was written for.
start_url = 'https://www.nytimes.com/'
# urllib.urlopen is the Python 2 API, matching the rest of this script.
html = urllib.urlopen(start_url).read()
soup = BeautifulSoup(html, 'html.parser')
# 'h2.story-heading' is a CSS selector, so it goes through select().
titles = soup.select('h2.story-heading')
hrefs = []
# Only the slice 3:4 (the fourth matching headline) is followed; widen the
# slice to process more of the page.
for title in titles[3:4]:
    anchors = title.select('a')
    if not anchors:
        # Headline without a link: nothing to follow.
        continue
    target = anchors[0].get('href')
    if target is None:
        # Anchor without an href attribute: nothing to follow.
        continue
    # Collect the link so the scraping loop at the end of the file has
    # something to iterate over.
    hrefs.append(target.encode('utf8'))
# get text of each article
# Creates the Selenium Chrome driver that get_page() reads the headline and
# paragraphs from. The chromedriver path passed here is hard-coded to one
# user's Downloads folder, so it must be edited before running elsewhere.
driver = webdriver.Chrome("/Users/coblezc/Downloads/chromedriver")
def get_page():
    """Extract place names from the article currently loaded in ``driver``.

    Reads the first ``<h1>`` as the headline and every
    ``p.story-body-text`` paragraph from the page, runs each paragraph
    through the spaCy pipeline ``nlp`` and stores the text of every entity
    labelled ``GPE`` under the headline in the module-level ``scrape``
    dict. It then rebuilds the module-level ``locations`` dict with one
    deduplicated, comma-separated string per headline, formats every entry
    as ``headline: places``, concatenates them and prints the first 139
    characters.

    Takes no arguments and returns None. Relies on the module-level names
    ``driver``, ``nlp``, ``scrape`` and ``locations``.
    """
    # NOTE(review): the file announces a "timer delay between scrapes" at
    # this point, but no delay call is present; add time.sleep(<seconds>)
    # here if throttling between article loads is wanted.

    # get headline
    # Kept as unicode text: the format template below is unicode (the file
    # enables unicode_literals), and formatting UTF-8 byte strings into it
    # fails on non-ASCII headlines.
    headline = driver.find_element_by_tag_name('h1').text
    # dictionary data model: {headline: [location1, location2, ...], ...}
    scrape[headline] = []
    paragraphs = driver.find_elements_by_css_selector('p.story-body-text')
    for para in paragraphs:
        # spacy to extract locations; the paragraph text is passed straight
        # through instead of being encoded and immediately decoded again.
        doc = nlp(para.text)
        for ent in doc.ents:
            if ent.label_ == 'GPE':
                scrape[headline].append(ent.text)

    # dedupe locations, keeping first-seen order
    for key, value in scrape.items():
        unique = []
        for item in value:
            if item not in unique:
                unique.append(item)
        locations[key] = ', '.join(unique)

    # combine key/value into list
    collapse_dict = ['{}: {}'.format(k, v) for k, v in locations.items()]
    # join list into string (entries are concatenated without a separator)
    dict_to_string = ''.join(collapse_dict)
    # limit string so it fits in a 140-character status; the slice keeps at
    # most 139 characters
    toot = dict_to_string[:139]
    # Parenthesised form prints the same single value on Python 2 and 3.
    print(toot)
    # NOTE(review): the file ends this section with "toot the toot", but no
    # call that posts `toot` through the `twitter` client is present, so
    # this function only prints. Add the posting call here if intended.
for href in hrefs:
