Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
Extract locations mentioned in articles on the front page of nytimes.com
from __future__ import unicode_literals
from bs4 import BeautifulSoup
import urllib
from selenium import webdriver
import time
import spacy
from twython import Twython
import json
# "loadin' the pipeline"
nlp = spacy.load('en')

# containers filled in while scraping:
#   scrape:    headline -> list of place names found in that article
#   locations: headline -> comma-separated, de-duplicated place names
scrape = dict()
locations = dict()

# twitter credentials/authorization, read from a local JSON file
with open('cred.json', 'r') as credfile:
    auth = json.load(credfile)

twitter = Twython(
    auth['APP_KEY'],
    auth['APP_SECRET'],
    auth['ACCESS_TOKEN'],
    auth['ACCESS_TOKEN_SECRET']
)
# get URLs of articles on NYTimes front page
start_url = 'https://www.nytimes.com'
response = urllib.urlopen(start_url)
try:
    html = response.read()
finally:
    # release the HTTP connection even if the read fails
    response.close()

soup = BeautifulSoup(html, 'html.parser')
titles = soup.select('h2.story-heading')

hrefs = []
# NOTE: the [3:4] slice keeps only the fourth headline on the page;
# widen it to process more articles.
for title in titles[3:4]:
    links = title.select('a')
    if not links:
        # heading without a link: nothing to visit
        continue
    href = links[0].get('href')
    if not href:
        # anchor without an href attribute
        continue
    hrefs.append(href.encode('utf8'))
# get text of each article
# Start a Chrome browser through Selenium. The chromedriver location is
# hard-coded to one user's Downloads folder, so it has to be edited before
# the script can run on another machine.
driver = webdriver.Chrome("/Users/coblezc/Downloads/chromedriver")
def get_page():
    """Scrape the article loaded in ``driver`` and tweet its locations.

    Reads the headline (first ``h1``) and every ``p.story-body-text``
    paragraph from the module-level ``driver``, runs each paragraph through
    the module-level spaCy pipeline ``nlp`` and stores the text of every
    entity labelled ``GPE`` under the headline in ``scrape``.  It then
    rebuilds ``locations`` (headline -> comma-separated, de-duplicated
    place names) from everything currently in ``scrape``, prints the
    resulting status text and posts it with ``twitter.update_status``.

    All text is kept as unicode from start to finish: mixing a UTF-8
    encoded headline into a unicode format string fails on any headline
    containing non-ASCII characters (e.g. curly quotes).

    NOTE: ``scrape`` and ``locations`` are module-level and are not cleared
    here, so headlines from earlier calls are included in later statuses.
    """
    # timer delay between scrapes
    time.sleep(2)

    # get headline
    headline = driver.find_element_by_tag_name('h1').text

    # dictionary data model: {headline: [place1, place2, place3, etc], ...}
    places = scrape[headline] = []
    for para in driver.find_elements_by_css_selector('p.story-body-text'):
        # spacy to extract locations (entities labelled GPE)
        doc = nlp(para.text)
        places.extend(ent.text for ent in doc.ents if ent.label_ == 'GPE')

    # dedupe locations, keeping the order in which they were first seen
    for key, value in scrape.items():
        seen = set()
        unique = []
        for item in value:
            if item not in seen:
                seen.add(item)
                unique.append(item)
        locations[key] = ', '.join(unique)

    # combine key/value into list
    collapse_dict = ['{}: {}'.format(k, v) for k, v in locations.items()]
    # join list into string
    dict_to_string = ''.join(collapse_dict)
    # truncate to 139 characters so the status stays within the
    # 140-character limit
    toot = dict_to_string[:139]
    print(toot)
    # toot the toot
    twitter.update_status(status=toot)
# visit each collected article URL, then scrape and tweet it
try:
    for href in hrefs:
        driver.get(href)
        get_page()
finally:
    # always shut the browser down, even when a scrape or tweet fails,
    # so no Chrome/chromedriver process is left running
    driver.quit()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
You can’t perform that action at this time.