Created October 25, 2017 15:01
Extract locations mentioned in articles on front page of nytimes.com
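The script assumes a cred.json file in the same directory holding the Twitter API credentials that the code reads below. A minimal sketch of what that file might look like, with the key names taken from the code and placeholder values standing in for real credentials:

{
    "APP_KEY": "your-app-key",
    "APP_SECRET": "your-app-secret",
    "ACCESS_TOKEN": "your-access-token",
    "ACCESS_TOKEN_SECRET": "your-access-token-secret"
}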
from __future__ import unicode_literals
from bs4 import BeautifulSoup
import urllib
from selenium import webdriver
import time
import spacy
from twython import Twython
import json

# "loadin' the pipeline" -- spaCy's English model
nlp = spacy.load('en')

# dictionaries for scraping results
scrape = {}
locations = {}

# twitter credentials/authorization
with open('cred.json', 'r') as credfile:
    auth = json.load(credfile)
twitter = Twython(auth['APP_KEY'], auth['APP_SECRET'],
                  auth['ACCESS_TOKEN'], auth['ACCESS_TOKEN_SECRET'])

# get URLs of articles on the NYTimes front page
start_url = 'https://www.nytimes.com'
html = urllib.urlopen(start_url).read()
soup = BeautifulSoup(html, 'html.parser')
titles = soup.select('h2.story-heading')
hrefs = []
# only one story heading (index 3) is scraped here
for title in titles[3:4]:
    link = title.select('a')[0]
    href = link.get('href').encode('utf8')
    hrefs.append(href)

# get text of each article
driver = webdriver.Chrome("/Users/coblezc/Downloads/chromedriver")

def get_page():
    # timer delay between scrapes
    time.sleep(2)
    # get headline
    headline1 = driver.find_element_by_tag_name('h1').text
    headline = headline1.encode('utf-8')
    # dictionary data model: {headline: [paragraph1, paragraph2, ...], ...}
    scrape[headline] = []
    paragraphs = driver.find_elements_by_css_selector('p.story-body-text')
    for para in paragraphs:
        # get text of each paragraph
        text_raw = para.text
        text = text_raw.encode('utf-8')
        # spacy to extract locations (GPE = countries, cities, states)
        doc = nlp(text.decode('utf8'))
        for ent in doc.ents:
            if ent.label_ == 'GPE':
                scrape.setdefault(headline, []).append(ent.text)
    # dedupe locations, preserving order of first mention
    for key, value in scrape.items():
        new_list = []
        for item in value:
            if item not in new_list:
                new_list.append(item)
        new_list_joined = ', '.join(new_list)
        locations[key] = new_list_joined
    # combine key/value pairs into a list of "headline: locations" strings
    collapse_dict = ['{}: {}'.format(k, v) for k, v in locations.iteritems()]
    # join list into a single string
    dict_to_string = ''.join(collapse_dict)
    # trim to fit the 140-character tweet limit
    toot = dict_to_string[:139]
    print toot
    # toot the toot
    twitter.update_status(status=toot)

for href in hrefs:
    driver.get(href)
    get_page()