nytimes spider (by @mabelvj): a Scrapy spider that uses scrapy-splash to render the New York Times search page, click its "Show More" button repeatedly, and extract the title, type and date of each result.
# -*- coding: utf-8 -*-
import scrapy
from scrapy_splash import SplashRequest

# Lua script run by Splash: scroll to the bottom of the page and click the
# "Show More" button repeatedly so additional search results are loaded
# before the rendered HTML is returned.
script = """
function main(splash)
    local num_scrolls = 10
    assert(splash:go(splash.args.url))
    -- Return the viewport coordinates of the "Show More" button.
    local get_dimensions = splash:jsfunc([[
        function () {
            var rect = document.getElementsByClassName('css-vsuiox')[0]
                .getElementsByTagName('button')[0]
                .getClientRects()[0];
            return {"x": rect.left, "y": rect.top};
        }
    ]])
    local scroll_to = splash:jsfunc("window.scrollTo")
    local get_body_height = splash:jsfunc(
        "function() {return document.body.scrollHeight;}"
    )
    -- Loop and click the `Show More` button multiple times.
    splash:set_viewport_full()
    for i = 1, num_scrolls do
        scroll_to(0, get_body_height())
        splash:wait(0.5) -- let the button render; avoids "TypeError: null is not an object"
        local dimensions = get_dimensions()
        splash:mouse_click(dimensions.x, dimensions.y)
    end
    -- Wait a split second to allow the click event to propagate.
    splash:wait(0.1)
    return splash:html()
end
"""
class NewsSpider(scrapy.Spider):
    name = 'news'
    allowed_domains = ['www.nytimes.com']
    start_urls = ['http://www.nytimes.com/']

    def __init__(self, start_date="2018/01/01", end_date="2019/01/01",
                 *args, **kwargs):
        super(NewsSpider, self).__init__(*args, **kwargs)
        self.urls = [self.format_query(url, start_date, end_date)
                     for url in self.start_urls]
    @staticmethod
    def format_query(url, start_date, end_date, sort='oldest'):
        """Build the NYT search URL for the given date range.

        Params:
            start_date, end_date: string.
                Format should be "%Y/%m/%d".
            sort: string ('oldest', 'newest' or 'best').

        Example of a resulting query URL:
        "https://www.nytimes.com/search?endDate=20190101&query=archives&sort=best&startDate=20180101"
        """
        def format_date(date):
            return date.replace('/', '')
        start = format_date(start_date)
        end = format_date(end_date)
        sep = "/" if url[-1] != "/" else ""
        query = (url + sep +
                 "search?" +
                 "endDate=%s" % end +
                 "&query=archives&sort=%s" % sort +
                 "&startDate=%s" % start)
        return query
    def start_requests(self):
        for url in self.urls:
            # Render each URL through the Splash 'execute' endpoint, running
            # the Lua script above so extra results load before parsing.
            yield SplashRequest(url, self.parse,
                                endpoint='execute',
                                args={'lua_source': script,
                                      'wait': 0.5},
                                dont_process_response=True,
                                )
    def parse(self, response):
        # from scrapy.shell import inspect_response
        # inspect_response(response, self)
        entries = response.xpath('//li[@data-testid="search-bodega-result"]')
        # Use relative XPaths (leading "./") so each field is extracted from
        # the current result item rather than from the whole document.
        for entry in entries:
            yield {'title': entry.xpath('.//h4//text()').extract_first(),
                   'type': entry.xpath('.//p[@class="css-myxawk"]//text()').extract_first(),
                   'date': entry.xpath('.//time//text()').extract_first()}
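
To run this spider, the Scrapy project needs the scrapy-splash plumbing enabled in settings.py and a Splash instance running (for example via: docker run -p 8050:8050 scrapinghub/splash). Below is a minimal sketch of the relevant settings, following the scrapy-splash README and assuming Splash listens on localhost:8050; adjust SPLASH_URL to your setup:

# settings.py -- minimal scrapy-splash wiring (sketch)
SPLASH_URL = 'http://localhost:8050'

DOWNLOADER_MIDDLEWARES = {
    'scrapy_splash.SplashCookiesMiddleware': 723,
    'scrapy_splash.SplashMiddleware': 725,
    'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 810,
}

SPIDER_MIDDLEWARES = {
    'scrapy_splash.SplashDeduplicateArgsMiddleware': 100,
}

DUPEFILTER_CLASS = 'scrapy_splash.SplashAwareDupeFilter'

The spider can then be invoked with the date bounds passed as spider arguments, e.g.: scrapy crawl news -a start_date=2018/01/01 -a end_date=2019/01/01 -o results.json. Note that the class names and test ids used above (css-vsuiox, css-myxawk, search-bodega-result) come from the NYT markup at the time of writing and may change.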