Skip to content

Instantly share code, notes, and snippets.

@clemfromspace
Last active June 26, 2021 08:02
Show Gist options
  • Star 10 You must be signed in to star a gist
  • Fork 2 You must be signed in to fork a gist
  • Save clemfromspace/bbe9a99da1e86a753baa00e394234b8c to your computer and use it in GitHub Desktop.
Scrapy with Puppeteer
"""This module contains the ``SeleniumMiddleware`` scrapy middleware"""
import asyncio
from pyppeteer import launch
from scrapy import signals
from scrapy.http import HtmlResponse
from twisted.internet import defer
from .http import PuppeteerRequest
def _force_deferred(coro):
    """Bridge an asyncio coroutine into a twisted ``Deferred``.

    Schedules *coro* on the running asyncio loop and returns a Deferred
    that fires with the coroutine's result (via ``Future.result()``, so
    exceptions propagate too) once the future completes.
    """
    deferred = defer.Deferred()
    # Unwrap the finished asyncio Future into its actual result/exception.
    deferred.addCallback(lambda finished: finished.result())
    task = asyncio.ensure_future(coro)
    task.add_done_callback(deferred.callback)
    return deferred
class PuppeteerMiddleware:
    """Scrapy downloader middleware that renders ``PuppeteerRequest``s
    in a headless Chromium browser via pyppeteer.

    A single browser is launched per crawler; each request is rendered
    in its own page (tab), which is always closed afterwards.
    """

    # Shared pyppeteer Browser instance, created lazily in from_crawler.
    browser = None

    @classmethod
    async def _from_crawler(cls, crawler):
        """Async half of ``from_crawler``: launch the browser and hook signals."""
        middleware = cls()
        middleware.browser = await launch()
        crawler.signals.connect(middleware.spider_closed, signals.spider_closed)
        return middleware

    @classmethod
    def from_crawler(cls, crawler):
        """Initialize the middleware (scrapy entry point, must be sync)."""
        return _force_deferred(cls._from_crawler(crawler))

    async def _process_request(self, request, spider):
        """Render *request* with puppeteer if applicable.

        Returns ``None`` for non-Puppeteer requests so scrapy's default
        download path handles them; otherwise an ``HtmlResponse`` built
        from the rendered DOM.
        """
        if not isinstance(request, PuppeteerRequest):
            return None

        page = await self.browser.newPage()
        try:
            # pyppeteer's setCookie takes one dict PER cookie (*cookies),
            # not a single {name: value} mapping as scrapy stores them.
            if request.cookies:
                await page.setCookie(*[
                    {'name': name, 'value': value, 'url': request.url}
                    for name, value in request.cookies.items()
                ])
            await page.setViewport(
                {'width': 1200, 'height': 900, 'deviceScaleFactor': 2})
            await page.goto(request.url, {'waitUntil': request.wait_until})

            if request.screenshot:
                # Raw screenshot bytes, exposed via response.meta['screenshot'].
                request.meta['screenshot'] = await page.screenshot()

            body = await page.content()
            return HtmlResponse(
                # ``url`` is a property on pyppeteer pages, not a method.
                page.url,
                body=body,
                encoding='utf-8',
                request=request,
            )
        finally:
            # Always release the tab, even when navigation/rendering fails,
            # otherwise Chromium leaks one tab per request.
            await page.close()

    def process_request(self, request, spider):
        """Scrapy hook: defer to the async implementation."""
        return _force_deferred(self._process_request(request, spider))

    async def _spider_closed(self):
        """Async half of ``spider_closed``: shut down Chromium."""
        await self.browser.close()

    def spider_closed(self):
        """Shutdown the browser when the spider is closed."""
        return _force_deferred(self._spider_closed())
"""This module contains the ``SeleniumRequest`` class"""
from scrapy import Request
class PuppeteerRequest(Request):
    """Scrapy ``Request`` subclass providing additional arguments"""

    def __init__(self, *args, wait_until='load', screenshot=False, **kwargs):
        """Initialize a new Puppeteer request.

        ``screenshot`` is keyword-only: in the original signature it sat
        before ``*args``, so ``PuppeteerRequest(url)`` silently bound the
        URL to ``screenshot`` instead of passing it to ``Request``.

        Parameters
        ----------
        wait_until: str
            Passed to pyppeteer's ``page.goto`` ``waitUntil`` option
            (e.g. 'load', 'domcontentloaded', 'networkidle0'); the
            middleware reads this attribute when navigating.
        screenshot: bool
            If True, a screenshot of the page will be taken and the data of the screenshot
            will be returned in the response "meta" attribute.
        """
        self.wait_until = wait_until
        self.screenshot = screenshot
        super().__init__(*args, **kwargs)
@NikolayGalkin
Copy link

Thanks for the gist. What about POST requests?

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment