Skip to content

Instantly share code, notes, and snippets.

@kmike
Last active September 14, 2018 16:24
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save kmike/2112c0b7c3d28ab5047be9a4e6e6487c to your computer and use it in GitHub Desktop.
Save kmike/2112c0b7c3d28ab5047be9a4e6e6487c to your computer and use it in GitHub Desktop.
import scrapy
from scrapy.http import TextResponse
from scrapy_splash import LuaRequest
class BooksSpider(scrapy.Spider):
    """Scrape books.toscrape.com with one Lua script that returns a single page.

    The script opens the start URL, clicks the ``.next a`` element 50 times
    (waiting one second after each click) and returns the HTML as it stands
    once the loop has finished, so ``parse`` only ever sees that final
    snapshot.
    """

    name = 'books'
    allowed_domains = ['books.toscrape.com']

    def start_requests(self):
        """Yield the single ``LuaRequest`` that runs the pagination script."""
        # Kept flush-left so the script text sent to the renderer is unchanged.
        pagination_script = """
splash:go(args.url)
for i=1,50 do
splash:css(".next a"):click()
splash:wait(1.0)
end
return splash:html()
"""
        yield LuaRequest(
            'http://books.toscrape.com/',
            lua_source=pagination_script,
        )

    def parse(self, response):
        """Yield a ``{'title', 'url'}`` dict for every ``article.product_pod``."""
        for product in response.css('article.product_pod'):
            title = product.css('h3 a::text').get()
            href = product.css('h3 a::attr(href)').get()
            yield {'title': title, 'url': href}
class BooksSpider2(scrapy.Spider):
    """Scrape books.toscrape.com by collecting every page's HTML in one script.

    The Lua script clicks ``.next a`` 50 times and appends the HTML seen after
    each click to a table, which is returned as the script result.  ``parse``
    then wraps each HTML string in its own ``TextResponse`` and hands it to
    ``parse_page`` for item extraction.
    """

    name = 'books'
    allowed_domains = ['books.toscrape.com']

    def start_requests(self):
        """Yield the single ``LuaRequest`` that runs the collecting script."""
        # `res` must be created once, *before* the loop: creating it inside
        # the loop body would discard all previously collected pages on every
        # iteration and leave only the last page in the result.
        # NOTE(review): the page loaded by splash:go is not itself appended;
        # only the HTML seen after each click is collected - confirm intended.
        yield LuaRequest(
            'http://books.toscrape.com/',
            lua_source="""
splash:go(args.url)
local res = {}
for i=1,50 do
splash:css(".next a"):click()
splash:wait(1.0)
table.insert(res, splash:html())
end
return res
"""
        )

    def parse(self, response):
        """Re-parse each HTML snapshot in ``response.data`` as its own page.

        Yields whatever ``parse_page`` yields for every snapshot.
        """
        # NOTE(review): assumes response.data iterates over the HTML strings
        # returned by the script (i.e. the Lua table arrives as a list) -
        # confirm against the request/response implementation.
        for page_html in response.data:
            page_response = TextResponse(
                response.url, body=page_html, encoding='utf8'
            )
            yield from self.parse_page(page_response)

    def parse_page(self, response):
        """Yield a ``{'title', 'url'}`` dict for every ``article.product_pod``."""
        for book in response.css('article.product_pod'):
            yield {
                'title': book.css('h3 a::text').get(),
                'url': book.css('h3 a::attr(href)').get(),
            }
class BooksSpider3(scrapy.Spider):
    """Scrape books.toscrape.com with a script that sends HTML as it goes.

    The Lua script clicks ``.next a`` 50 times and, after each click and a
    one-second wait, passes the current HTML to ``splash:send``.  The script
    has no ``return`` statement.
    """

    name = 'books'
    allowed_domains = ['books.toscrape.com']

    def start_requests(self):
        """Yield the single ``SuperLuaRequest`` that runs the sending script."""
        # NOTE(review): SuperLuaRequest is not imported in the visible part of
        # this file - confirm where it is defined.
        # NOTE(review): presumably every splash:send() call results in a
        # separate response reaching parse() - confirm.
        # Kept flush-left so the script text sent to the renderer is unchanged.
        streaming_script = """
splash:go(args.url)
for i=1,50 do
splash:css(".next a"):click()
splash:wait(1.0)
splash:send(splash:html())
end
"""
        yield SuperLuaRequest(
            'http://books.toscrape.com/',
            lua_source=streaming_script,
        )

    def parse(self, response):
        """Yield a ``{'title', 'url'}`` dict for every ``article.product_pod``."""
        for product in response.css('article.product_pod'):
            title = product.css('h3 a::text').get()
            href = product.css('h3 a::attr(href)').get()
            yield {'title': title, 'url': href}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment