wvengen/scrapy_h2_proxy.py

## scrapy_h2_proxy.py
"""
Download handler for HTTP/2 supporting proxy CONNECT over HTTP/1.

You can use this by configuring it in the spider settings:

    DOWNLOAD_HANDLERS = {
        'https': 'scrapy_h2_proxy.H2DownloadHandler',
    }

Tested with Scrapy 2.5.0.
"""
import ipaddress
from collections import deque
from typing import Optional, Tuple

from twisted.internet import defer
from twisted.internet.base import ReactorBase
from twisted.internet.defer import Deferred
from twisted.internet.endpoints import HostnameEndpoint
from twisted.web.client import URI, BrowserLikePolicyForHTTPS, _StandardEndpointFactory

from scrapy.core.http2.agent import H2Agent, H2ConnectionPool as _H2ConnectionPool, H2ClientProtocol as _H2ClientProtocol, H2ClientFactory as _H2ClientFactory
from scrapy.core.downloader.contextfactory import load_context_factory_from_settings
from scrapy.core.downloader.webclient import _parse
from scrapy.core.downloader.handlers.http11 import TunnelingTCP4ClientEndpoint
from scrapy.core.downloader.handlers.http2 import ScrapyH2Agent as _ScrapyH2Agent, H2DownloadHandler as _H2DownloadHandler
from scrapy.crawler import Crawler
from scrapy.spiders import Spider
from scrapy.http.request import Request
from scrapy.settings import Settings
from scrapy.utils.python import to_bytes


class H2ClientProtocol(_H2ClientProtocol):
    """HTTP/2 client protocol that postpones the connection preamble.

    The preamble is written from ``handshakeCompleted`` instead of
    ``connectionMade``, so it is not sent over the HTTP/1 proxying connection.
    """

    def connectionMade(self) -> None:
        """Handle Twisted's notification that the connection is established.

        The parent implementation also sends the HTTP/2 preamble at this
        point. That has to be avoided for the HTTP/1 proxying connection, so
        this override only sets the idle timeout and records the peer's IP
        address; the preamble is sent after TLS negotiation instead.
        """
        self.setTimeout(self.IDLE_TIMEOUT)

        peer = self.transport.getPeer()
        self.metadata['ip_address'] = ipaddress.ip_address(peer.host)

        # Deliberately no initiate_connection() here; see handshakeCompleted().

    def handshakeCompleted(self) -> None:
        """Initiate the HTTP/2 connection once the handshake has completed.

        super() is intentionally not called: per the original author it checks
        ``self.transport.negotiatedProtocol``, which is not defined in this
        setup.
        """
        self.conn.initiate_connection()
        self._write_to_transport()

class H2ClientFactory(_H2ClientFactory):
    """Client factory that builds this module's ``H2ClientProtocol``."""

    def buildProtocol(self, addr) -> H2ClientProtocol:
        """Create the protocol instance for a new connection.

        Overridden so that the ``H2ClientProtocol`` name resolves to the
        subclass defined in this module. ``addr`` is not used.
        """
        protocol = H2ClientProtocol(self.uri, self.settings, self.conn_lost_deferred)
        return protocol

class H2ConnectionPool(_H2ConnectionPool):
    """Connection pool that connects through this module's ``H2ClientFactory``."""

    def _new_connection(self, key: Tuple, uri: URI, endpoint: HostnameEndpoint) -> Deferred:
        """Start a connection for ``key`` and return a Deferred queued for it.

        Overridden so that the ``H2ClientFactory`` name resolves to the
        subclass defined in this module.

        A fresh pending-request queue is registered under ``key``, the
        connection attempt is started on ``endpoint``, and a new Deferred is
        appended to that queue and returned. ``self.put_connection`` is
        attached as the callback of the connection attempt, and
        ``self._remove_connection`` as the callback of the connection-lost
        Deferred handed to the factory.
        """
        self._pending_requests[key] = deque()

        on_connection_lost = Deferred()
        on_connection_lost.addCallback(self._remove_connection, key)

        connecting = endpoint.connect(
            H2ClientFactory(uri, self.settings, on_connection_lost)
        )
        connecting.addCallback(self.put_connection, key)

        # The waiter is queued only after the connection attempt was started;
        # this ordering is kept exactly as in the original.
        waiter = Deferred()
        self._pending_requests[key].append(waiter)
        return waiter

class ScrapyTunnelingH2OverH1Agent(H2Agent):
    """An agent that uses a L{TunnelingTCP4ClientEndpoint} to make HTTP/2
    downloads. It may look strange that we have chosen to subclass H2Agent and
    not a proxy agent, but consider that after the tunnel is opened the proxy
    is transparent to the client; thus the agent should behave like there is
    no proxy involved.
    """

    def __init__(
        self,
        reactor: ReactorBase,
        proxy_auth: Optional[bytes],
        proxy_uri: URI,
        pool: H2ConnectionPool,
        context_factory: BrowserLikePolicyForHTTPS = BrowserLikePolicyForHTTPS(),
        connect_timeout: Optional[float] = None,
        bind_address: Optional[bytes] = None,
    ) -> None:
        """Store the proxy configuration and initialise the parent agent.

        :param reactor: reactor forwarded to the parent agent.
        :param proxy_auth: value of the request's ``Proxy-Authorization``
            header, or ``None`` (presumably bytes, as it is read from the
            request headers -- confirm against the caller).
        :param proxy_uri: URI of the proxy; its host and port are used.
        :param pool: connection pool forwarded to the parent agent.
        :param context_factory: TLS policy forwarded to the parent agent.
        :param connect_timeout: connect timeout forwarded to the parent agent.
        :param bind_address: local bind address forwarded to the parent agent.
        """
        super().__init__(
            reactor=reactor,
            pool=pool,
            context_factory=context_factory,
            connect_timeout=connect_timeout,
            bind_address=bind_address,
        )
        self._proxy_uri = proxy_uri
        self._proxy_auth = proxy_auth

    def get_endpoint(self, uri: URI):
        """Return a ``TunnelingTCP4ClientEndpoint`` for the destination ``uri``.

        The proxy host, port and auth value are passed as ``proxyConf``.
        Timeout and bind address are read back from ``self.endpoint_factory``
        (presumably created by the parent constructor from the arguments given
        to ``__init__`` -- confirm against H2Agent).
        """
        return TunnelingTCP4ClientEndpoint(
            reactor=self._reactor,
            host=uri.host,
            port=uri.port,
            proxyConf=(self._proxy_uri.host, self._proxy_uri.port, self._proxy_auth),
            contextFactory=self._context_factory,
            timeout=self.endpoint_factory._connectTimeout,
            bindAddress=self.endpoint_factory._bindAddress,
        )

    def get_key(self, uri: URI) -> Tuple:
        """Return the pool key for a tunnelled connection to ``uri``.

        The key combines the proxy (host, port, auth) with the destination
        (host, port); every combination needs its own connection. The proxy
        auth value is part of the key so that a tunnel opened with one set of
        proxy credentials is never reused for a request that carries
        different credentials.
        """
        return (
            "http-proxy",
            self._proxy_uri.host,
            self._proxy_uri.port,
            self._proxy_auth,
            uri.host,
            uri.port,
        )


class H2DownloadHandler(_H2DownloadHandler):
    """HTTP/2 download handler wired to this module's pool and agent classes."""

    def __init__(self, settings: Settings, crawler: Optional[Crawler] = None):
        """Set up the handler without delegating to the parent constructor.

        Per the original author the statements are the same as the parent's;
        they are repeated here so that the ``H2ConnectionPool`` name resolves
        to the subclass defined in this module.
        """
        from twisted.internet import reactor

        self._crawler = crawler
        self._pool = H2ConnectionPool(reactor, settings)
        self._context_factory = load_context_factory_from_settings(settings, crawler)

    def download_request(self, request: Request, spider: Spider) -> Deferred:
        """Download ``request`` through a newly created ``ScrapyH2Agent``.

        The agent is given this handler's context factory, pool and crawler;
        the value returned by its ``download_request`` is returned unchanged.
        """
        return ScrapyH2Agent(
            context_factory=self._context_factory,
            pool=self._pool,
            crawler=self._crawler,
        ).download_request(request, spider)

class ScrapyH2Agent(_ScrapyH2Agent):
    """Agent selector that adds CONNECT tunnelling for proxied HTTPS requests."""

    # Agent class instantiated for https requests that go through a proxy.
    _TunnelingH1OverH2Agent = ScrapyTunnelingH2OverH1Agent

    def __init__(
        self, context_factory,
        pool: H2ConnectionPool,
        connect_timeout: int = 10,
        bind_address: Optional[bytes] = None,
        crawler: Optional[Crawler] = None,
    ) -> None:
        """Forward all arguments, unchanged, to the parent constructor."""
        super().__init__(context_factory, pool, connect_timeout, bind_address, crawler)

    def _get_agent(self, request: Request, timeout: Optional[float]) -> H2Agent:
        """Select the agent that should download ``request``.

        When ``request.meta['proxy']`` is set, the request URL scheme is
        https and the proxy parameters do not contain ``noconnect``, an
        instance of ``_TunnelingH1OverH2Agent`` is returned. Every other case
        is delegated to the parent implementation.
        """
        from twisted.internet import reactor

        bind_address = request.meta.get('bindaddress') or self._bind_address
        proxy = request.meta.get('proxy')
        if proxy:
            # Only the trailing params element of the parsed proxy URL is needed.
            _scheme, _netloc, _host, _port, proxy_params = _parse(proxy)
            targets_https = _parse(request.url)[0] == b'https'
            tunnel_disabled = b'noconnect' in proxy_params
            if targets_https and not tunnel_disabled:
                return self._TunnelingH1OverH2Agent(
                    reactor=reactor,
                    proxy_auth=request.headers.get(b'Proxy-Authorization', None),
                    context_factory=self._context_factory,
                    proxy_uri=URI.fromBytes(to_bytes(proxy, encoding='ascii')),
                    connect_timeout=timeout,
                    bind_address=bind_address,
                    pool=self._pool,
                )

        # Everything else is handled by the regular parent agent selection.
        return super()._get_agent(request, timeout)
	"""
	Download handler for HTTP/2 supporting proxy CONNECT over HTTP/1.

	You can use this by configuring it in the spider settings:

	DOWNLOAD_HANDLERS = {
	'https': 'scrapy_h2_proxy.H2DownloadHandler',
	}

	Tested with Scrapy 2.5.0.
	"""
	import ipaddress
	from collections import deque
	from typing import Optional, Tuple

	from twisted.internet import defer
	from twisted.internet.base import ReactorBase
	from twisted.internet.defer import Deferred
	from twisted.internet.endpoints import HostnameEndpoint
	from twisted.web.client import URI, BrowserLikePolicyForHTTPS, _StandardEndpointFactory

	from scrapy.core.http2.agent import H2Agent, H2ConnectionPool as _H2ConnectionPool, H2ClientProtocol as _H2ClientProtocol, H2ClientFactory as _H2ClientFactory
	from scrapy.core.downloader.contextfactory import load_context_factory_from_settings
	from scrapy.core.downloader.webclient import _parse
	from scrapy.core.downloader.handlers.http11 import TunnelingTCP4ClientEndpoint
	from scrapy.core.downloader.handlers.http2 import ScrapyH2Agent as _ScrapyH2Agent, H2DownloadHandler as _H2DownloadHandler
	from scrapy.crawler import Crawler
	from scrapy.spiders import Spider
	from scrapy.http.request import Request
	from scrapy.settings import Settings
	from scrapy.utils.python import to_bytes


	class H2ClientProtocol(_H2ClientProtocol):
	    """HTTP/2 client protocol that postpones the connection preamble.

	    The preamble is written from ``handshakeCompleted`` instead of
	    ``connectionMade``, so it is not sent over the HTTP/1 proxying connection.
	    """

	    def connectionMade(self) -> None:
	        """Called by Twisted when the connection is established.

	        The parent sends the preamble here, but that has to be avoided for
	        the HTTP/1 proxying connection, hence it is done after TLS
	        negotiation instead. Only the idle timeout is set and the peer's IP
	        address is recorded.
	        """
	        # Initialize the timeout
	        self.setTimeout(self.IDLE_TIMEOUT)

	        destination = self.transport.getPeer()
	        self.metadata['ip_address'] = ipaddress.ip_address(destination.host)

	        # omitting: Initiate H2 Connection

	    def handshakeCompleted(self) -> None:
	        """Initiate the HTTP/2 connection once the handshake has completed.

	        super() is not called because it checks
	        ``self.transport.negotiatedProtocol``, which (per the original
	        author) is not defined in this setup.
	        """
	        self.conn.initiate_connection()
	        self._write_to_transport()

	class H2ClientFactory(_H2ClientFactory):
	    """Client factory that builds this module's ``H2ClientProtocol``."""

	    def buildProtocol(self, addr) -> H2ClientProtocol:
	        """Create the protocol instance for a new connection.

	        Overridden so that the ``H2ClientProtocol`` name resolves to the
	        subclass defined in this module. ``addr`` is not used.
	        """
	        return H2ClientProtocol(self.uri, self.settings, self.conn_lost_deferred)

	class H2ConnectionPool(_H2ConnectionPool):
	    """Connection pool that connects through this module's ``H2ClientFactory``."""

	    def _new_connection(self, key: Tuple, uri: URI, endpoint: HostnameEndpoint) -> Deferred:
	        """Start a connection for ``key`` and return a Deferred queued for it.

	        Overridden so that the ``H2ClientFactory`` name resolves to the
	        subclass defined in this module.
	        """
	        self._pending_requests[key] = deque()

	        conn_lost_deferred = Deferred()
	        conn_lost_deferred.addCallback(self._remove_connection, key)

	        factory = H2ClientFactory(uri, self.settings, conn_lost_deferred)
	        conn_d = endpoint.connect(factory)
	        conn_d.addCallback(self.put_connection, key)

	        # Queued only after the connection attempt was started.
	        d = Deferred()
	        self._pending_requests[key].append(d)
	        return d

	class ScrapyTunnelingH2OverH1Agent(H2Agent):
	    """An agent that uses a L{TunnelingTCP4ClientEndpoint} to make HTTP/2
	    downloads. It may look strange that we have chosen to subclass H2Agent and
	    not a proxy agent, but consider that after the tunnel is opened the proxy
	    is transparent to the client; thus the agent should behave like there is
	    no proxy involved.
	    """

	    def __init__(
	        self,
	        reactor: ReactorBase,
	        proxy_auth: Optional[bytes],
	        proxy_uri: URI,
	        pool: H2ConnectionPool,
	        context_factory: BrowserLikePolicyForHTTPS = BrowserLikePolicyForHTTPS(),
	        connect_timeout: Optional[float] = None,
	        bind_address: Optional[bytes] = None,
	    ) -> None:
	        """Store the proxy configuration and initialise the parent agent.

	        ``proxy_auth`` is the request's ``Proxy-Authorization`` header value
	        or ``None`` (presumably bytes -- confirm against the caller); the
	        host and port of ``proxy_uri`` identify the proxy. All remaining
	        arguments are forwarded to the parent constructor.
	        """
	        super().__init__(
	            reactor=reactor,
	            pool=pool,
	            context_factory=context_factory,
	            connect_timeout=connect_timeout,
	            bind_address=bind_address,
	        )
	        self._proxy_uri = proxy_uri
	        self._proxy_auth = proxy_auth

	    def get_endpoint(self, uri: URI):
	        """Return a ``TunnelingTCP4ClientEndpoint`` for the destination ``uri``.

	        The proxy host, port and auth value are passed as ``proxyConf``;
	        timeout and bind address are read back from ``self.endpoint_factory``.
	        """
	        return TunnelingTCP4ClientEndpoint(
	            reactor=self._reactor,
	            host=uri.host,
	            port=uri.port,
	            proxyConf=(self._proxy_uri.host, self._proxy_uri.port, self._proxy_auth),
	            contextFactory=self._context_factory,
	            timeout=self.endpoint_factory._connectTimeout,
	            bindAddress=self.endpoint_factory._bindAddress,
	        )

	    def get_key(self, uri: URI) -> Tuple:
	        """Return the pool key for a tunnelled connection to ``uri``.

	        The key combines the proxy (host, port, auth) with the destination
	        (host, port); every combination needs its own connection. The proxy
	        auth value is included so that a tunnel opened with one set of proxy
	        credentials is never reused for a request carrying different ones.
	        """
	        return (
	            "http-proxy",
	            self._proxy_uri.host,
	            self._proxy_uri.port,
	            self._proxy_auth,
	            uri.host,
	            uri.port,
	        )


	class H2DownloadHandler(_H2DownloadHandler):
	    """HTTP/2 download handler wired to this module's pool and agent classes."""

	    def __init__(self, settings: Settings, crawler: Optional[Crawler] = None):
	        """Set up the handler without delegating to the parent constructor.

	        Per the original author the statements are the same as the parent's;
	        they are repeated so that the ``H2ConnectionPool`` name resolves to
	        the subclass defined in this module.
	        """
	        self._crawler = crawler

	        from twisted.internet import reactor
	        self._pool = H2ConnectionPool(reactor, settings)
	        self._context_factory = load_context_factory_from_settings(settings, crawler)

	    def download_request(self, request: Request, spider: Spider) -> Deferred:
	        """Download ``request`` through a newly created ``ScrapyH2Agent``."""
	        agent = ScrapyH2Agent(
	            context_factory=self._context_factory,
	            pool=self._pool,
	            crawler=self._crawler,
	        )
	        return agent.download_request(request, spider)

	class ScrapyH2Agent(_ScrapyH2Agent):
	    """Agent selector that adds CONNECT tunnelling for proxied HTTPS requests."""

	    # Agent class instantiated for https requests that go through a proxy.
	    _TunnelingH1OverH2Agent = ScrapyTunnelingH2OverH1Agent

	    def __init__(
	        self, context_factory,
	        pool: H2ConnectionPool,
	        connect_timeout: int = 10,
	        bind_address: Optional[bytes] = None,
	        crawler: Optional[Crawler] = None,
	    ) -> None:
	        """Forward all arguments, unchanged, to the parent constructor."""
	        super().__init__(context_factory, pool, connect_timeout, bind_address, crawler)

	    def _get_agent(self, request: Request, timeout: Optional[float]) -> H2Agent:
	        """Select the agent that should download ``request``.

	        When ``request.meta['proxy']`` is set, the request URL scheme is
	        https and the proxy parameters do not contain ``noconnect``, an
	        instance of ``_TunnelingH1OverH2Agent`` is returned. Every other
	        case is delegated to the parent implementation.
	        """
	        from twisted.internet import reactor
	        bind_address = request.meta.get('bindaddress') or self._bind_address
	        proxy = request.meta.get('proxy')
	        if proxy:
	            _, _, _proxy_host, _proxy_port, proxy_params = _parse(proxy)
	            scheme = _parse(request.url)[0]
	            omit_connect_tunnel = b'noconnect' in proxy_params
	            if scheme == b'https' and not omit_connect_tunnel:
	                # this is what we want to implement
	                proxy_auth = request.headers.get(b'Proxy-Authorization', None)
	                return self._TunnelingH1OverH2Agent(
	                    reactor=reactor,
	                    proxy_auth=proxy_auth,
	                    context_factory=self._context_factory,
	                    proxy_uri=URI.fromBytes(to_bytes(proxy, encoding='ascii')),
	                    connect_timeout=timeout,
	                    bind_address=bind_address,
	                    pool=self._pool,
	                )

	        # all the other cases are handled by the regular {ScrapyH2Agent}
	        return super()._get_agent(request, timeout)