Skip to content

Instantly share code, notes, and snippets.

@zhangtemplar
Created February 12, 2019 02:34
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save zhangtemplar/e74bded6ab48c9d766a5dfb372b7edc4 to your computer and use it in GitHub Desktop.
Save zhangtemplar/e74bded6ab48c9d766a5dfb372b7edc4 to your computer and use it in GitHub Desktop.
A change to scrapy_selenium that allows a different proxy to be used for each SeleniumRequest.
"""This module contains the ``SeleniumMiddleware`` scrapy middleware"""
import logging
from importlib import import_module

from scrapy import signals
from scrapy.exceptions import NotConfigured
from scrapy.http import HtmlResponse
from selenium.webdriver.support.ui import WebDriverWait

from proxy.pool import POOL

from .http import SeleniumRequest
class SeleniumMiddleware:
    """Scrapy middleware handling the requests using selenium

    A new driver is created for every ``SeleniumRequest`` so that each one
    is started with the proxy currently returned by ``POOL.get()``. Every
    driver created is remembered in ``self.drivers`` and shut down when the
    spider closes.
    """

    def __init__(self, driver_name, driver_executable_path, driver_arguments,
                 browser_executable_path):
        """Initialize the selenium webdriver

        Parameters
        ----------
        driver_name: str
            The selenium ``WebDriver`` to use
        driver_executable_path: str
            The path of the executable binary of the driver
        driver_arguments: list
            A list of arguments to initialize the driver. ``None`` is
            accepted and treated as "no extra arguments".
        browser_executable_path: str
            The path of the executable binary of the browser
        """
        webdriver_base_path = f'selenium.webdriver.{driver_name}'

        driver_klass_module = import_module(f'{webdriver_base_path}.webdriver')
        self.driver_klass = getattr(driver_klass_module, 'WebDriver')

        driver_options_module = import_module(f'{webdriver_base_path}.options')
        self.driver_options_klass = getattr(driver_options_module, 'Options')

        self.driver_executable_path = driver_executable_path
        self.driver_name = driver_name
        self.browser_executable_path = browser_executable_path
        # ``from_crawler`` reads this with ``settings.get``, which yields None
        # when the setting is absent; normalise so ``_get_driver`` can iterate.
        self.driver_arguments = list(driver_arguments or [])
        # Every driver handed out by ``process_request``; quit in
        # ``spider_closed``.
        self.drivers = []

    def _get_driver(self):
        """Create a new driver configured with a proxy taken from ``POOL``

        Returns
        -------
        WebDriver
            A newly started instance of ``self.driver_klass``.
        """
        driver_options = self.driver_options_klass()
        # NOTE(review): assumes POOL.get() always returns a usable
        # "host:port" style value -- confirm against proxy.pool.
        driver_options.add_argument("--proxy-server={}".format(POOL.get()))
        if self.browser_executable_path:
            driver_options.binary_location = self.browser_executable_path
        for argument in self.driver_arguments:
            driver_options.add_argument(argument)

        driver_kwargs = {
            'executable_path': self.driver_executable_path,
            f'{self.driver_name}_options': driver_options
        }
        return self.driver_klass(**driver_kwargs)

    @classmethod
    def from_crawler(cls, crawler):
        """Initialize the middleware with the crawler settings

        Raises
        ------
        NotConfigured
            If ``SELENIUM_DRIVER_NAME`` or ``SELENIUM_DRIVER_EXECUTABLE_PATH``
            is missing or empty.
        """
        driver_name = crawler.settings.get('SELENIUM_DRIVER_NAME')
        driver_executable_path = crawler.settings.get('SELENIUM_DRIVER_EXECUTABLE_PATH')
        browser_executable_path = crawler.settings.get('SELENIUM_BROWSER_EXECUTABLE_PATH')
        driver_arguments = crawler.settings.get('SELENIUM_DRIVER_ARGUMENTS')

        if not driver_name or not driver_executable_path:
            raise NotConfigured(
                'SELENIUM_DRIVER_NAME and SELENIUM_DRIVER_EXECUTABLE_PATH must be set'
            )

        middleware = cls(
            driver_name=driver_name,
            driver_executable_path=driver_executable_path,
            driver_arguments=driver_arguments,
            browser_executable_path=browser_executable_path
        )

        crawler.signals.connect(middleware.spider_closed, signals.spider_closed)

        return middleware

    @staticmethod
    def _cookie_dicts(cookies):
        """Convert request cookies into dicts suitable for ``add_cookie``

        Accepts either a ``{name: value}`` mapping or an iterable of cookie
        dicts (each already carrying ``name``/``value`` keys); ``None`` or an
        empty value yields an empty list.
        """
        if isinstance(cookies, dict):
            return [
                {'name': cookie_name, 'value': cookie_value}
                for cookie_name, cookie_value in cookies.items()
            ]
        return [dict(cookie) for cookie in cookies or ()]

    def process_request(self, request, spider):
        """Process a request using the selenium driver if applicable

        Returns
        -------
        HtmlResponse or None
            ``None`` when ``request`` is not a ``SeleniumRequest`` (so other
            handlers process it); otherwise a response built from the page
            source of a freshly created driver. That driver is exposed to
            the callback as ``request.meta['driver']``.
        """
        if not isinstance(request, SeleniumRequest):
            return None

        driver = self._get_driver()
        # Track the driver before using it so it is shut down in
        # ``spider_closed`` even if one of the calls below raises.
        self.drivers.append(driver)

        driver.get(request.url)

        for cookie in self._cookie_dicts(request.cookies):
            driver.add_cookie(cookie)

        if request.wait_until:
            WebDriverWait(driver, request.wait_time).until(
                request.wait_until
            )

        if request.screenshot:
            request.meta['screenshot'] = driver.get_screenshot_as_png()

        body = str.encode(driver.page_source)

        # Expose the driver via the "meta" attribute
        request.meta.update({'driver': driver})

        return HtmlResponse(
            driver.current_url,
            body=body,
            encoding='utf-8',
            request=request
        )

    def spider_closed(self):
        """Shutdown every tracked driver when spider is closed

        Shutdown is best-effort: a driver whose ``quit()`` raises is logged
        and skipped so the remaining drivers are still closed.
        """
        # Detach the list first so a repeated call does not quit twice.
        drivers, self.drivers = self.drivers, []
        for driver in drivers:
            try:
                driver.quit()
            except Exception:
                logging.getLogger(__name__).exception(
                    'Failed to quit selenium driver'
                )
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment