nytimes spider — a Scrapy + scrapy-splash spider that renders the NYT search page, clicks "Show More" repeatedly, and extracts the title, type, and date of each result.
# -*- coding: utf-8 -*-
import scrapy
from scrapy_splash import SplashRequest

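# Lua script run by Splash's 'execute' endpoint: it loads the search page,
# then repeatedly scrolls to the bottom and clicks the "Show More" button
# (the first <button> inside the 'css-vsuiox' container) so that additional
# results are rendered before the final HTML is returned.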
script = """ | |
function main(splash) | |
local num_scrolls = 10 | |
assert(splash:go(splash.args.url)) | |
local get_dimensions = splash:jsfunc([[ | |
function () { | |
var rect = document.getElementsByClassName('css-vsuiox')[0] | |
.getElementsByTagName('button')[0] | |
.getClientRects()[0]; | |
return {"x": rect.left, "y": rect.top}; | |
} | |
]]) | |
local scroll_to = splash:jsfunc("window.scrollTo") | |
local get_body_height = splash:jsfunc( | |
"function() {return document.body.scrollHeight;}" | |
) | |
-- Loop and click `Show More` button multiple times | |
splash:set_viewport_full() | |
for i=1, num_scrolls do | |
scroll_to(0, get_body_height()) | |
splash:wait(0.5) -- to avoid errors: TypeError: null is not an object | |
local dimensions = get_dimensions() | |
splash:mouse_click(dimensions.x, dimensions.y) | |
end | |
-- Wait split second to allow event to propagate. | |
splash:wait(0.1) | |
return splash:html() | |
end | |
""" | |
class NewsSpider(scrapy.Spider):
    name = 'news'
    allowed_domains = ['www.nytimes.com']
    start_urls = ['http://www.nytimes.com/']

    def __init__(self, start_date="2018/01/01", end_date="2019/01/01",
                 *args, **kwargs):
        super(NewsSpider, self).__init__(*args, **kwargs)
        self.urls = [self.format_query(url, start_date, end_date)
                     for url in self.start_urls]

    @staticmethod
    def format_query(url, start_date, end_date, sort='oldest'):
        """
        Build a date-bounded NYT search URL.

        Params:
            start_date, end_date: string.
                Format should be "%Y/%m/%d", e.g. "2018/01/01".
            sort: string ('oldest', 'newest', 'best')

        Example of url query:
        "https://www.nytimes.com/search?endDate=20190101&query=archives&sort=best&startDate=20180101"
        """
        def format_date(date):
            return date.replace('/', '')
        start = format_date(start_date)
        end = format_date(end_date)
        sep = "/" if url[-1] != "/" else ""
        query = (url + sep +
                 "search?" +
                 "endDate=%s" % end +
                 "&query=archives&sort=%s" % sort +
                 "&startDate=%s" % start)
        return query

    def start_requests(self):
        for url in self.urls:
            yield SplashRequest(url, self.parse,
                                endpoint='execute',
                                args={'lua_source': script,
                                      'wait': 0.5},
                                # Hand back a plain scrapy Response rather
                                # than a SplashJsonResponse.
                                dont_process_response=True,
                                )

    def parse(self, response):
        # from scrapy.shell import inspect_response
        # inspect_response(response, self)
        entries = response.xpath('//li[@data-testid="search-bodega-result"]')
        for entry in entries:
            # Relative XPaths ('.//') scope each field to its own entry; an
            # absolute '//...' path would match the whole document once per
            # entry and duplicate the results.
            yield {
                'title': ''.join(entry.xpath('.//h4//text()').extract()),
                'type': ''.join(entry.xpath('.//p[@class="css-myxawk"]//text()').extract()),
                'date': ''.join(entry.xpath('.//time//text()').extract()),
            }
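
# --- How to run (a sketch; assumes a standard scrapy-splash project setup) ---
# A Splash instance must be reachable (e.g. `docker run -p 8050:8050
# scrapinghub/splash`) and the project settings need the usual scrapy-splash
# entries from its README, roughly:
#
#   SPLASH_URL = 'http://localhost:8050'
#   DOWNLOADER_MIDDLEWARES = {
#       'scrapy_splash.SplashCookiesMiddleware': 723,
#       'scrapy_splash.SplashMiddleware': 725,
#       'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 810,
#   }
#   SPIDER_MIDDLEWARES = {'scrapy_splash.SplashDeduplicateArgsMiddleware': 100}
#   DUPEFILTER_CLASS = 'scrapy_splash.SplashAwareDupeFilter'
#
# Then, from the project directory:
#
#   scrapy crawl news -a start_date=2018/01/01 -a end_date=2019/01/01 -o results.json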