Skip to content

Instantly share code, notes, and snippets.

@wvengen
Last active Jul 21, 2022
Embed
What would you like to do?
Scrapy download handler for HTTP/2 over a HTTP/1 proxy (TLS only).
"""
Download handler for HTTP/2 supporting proxy CONNECT over HTTP/1.
You can use this by configuring it in the spider settings:

    DOWNLOAD_HANDLERS = {
        'https': 'scrapy_h2_proxy.H2DownloadHandler',
    }

Tested with Scrapy 2.5.0.
"""
import ipaddress
from collections import deque
from typing import Optional, Tuple
from twisted.internet import defer
from twisted.internet.base import ReactorBase
from twisted.internet.defer import Deferred
from twisted.internet.endpoints import HostnameEndpoint
from twisted.web.client import URI, BrowserLikePolicyForHTTPS, _StandardEndpointFactory
from scrapy.core.http2.agent import H2Agent, H2ConnectionPool as _H2ConnectionPool, H2ClientProtocol as _H2ClientProtocol, H2ClientFactory as _H2ClientFactory
from scrapy.core.downloader.contextfactory import load_context_factory_from_settings
from scrapy.core.downloader.webclient import _parse
from scrapy.core.downloader.handlers.http11 import TunnelingTCP4ClientEndpoint
from scrapy.core.downloader.handlers.http2 import ScrapyH2Agent as _ScrapyH2Agent, H2DownloadHandler as _H2DownloadHandler
from scrapy.crawler import Crawler
from scrapy.spiders import Spider
from scrapy.http.request import Request
from scrapy.settings import Settings
from scrapy.utils.python import to_bytes
class H2ClientProtocol(_H2ClientProtocol):
    """HTTP/2 client protocol that delays the connection preamble.

    The preamble is sent from ``handshakeCompleted`` rather than from
    ``connectionMade``, so nothing HTTP/2-specific is written while the
    connection is still being used for the HTTP/1 proxy exchange.
    """

    def connectionMade(self) -> None:
        """Set up the idle timeout and record the peer address.

        Twisted calls this once the connection is established. The parent
        class would also send the HTTP/2 connection preamble at this point;
        that must not happen on the HTTP/1 proxying connection, so it is
        postponed until TLS negotiation has finished.
        """
        # Arm the idle timeout.
        self.setTimeout(self.IDLE_TIMEOUT)

        peer = self.transport.getPeer()
        self.metadata['ip_address'] = ipaddress.ip_address(peer.host)
        # The H2 connection is intentionally NOT initiated here.

    def handshakeCompleted(self) -> None:
        """Initiate the H2 connection and flush the pending data."""
        # The parent implementation is deliberately not invoked: it inspects
        # self.transport.negotiatedProtocol, which somehow isn't defined in
        # this setup.
        self.conn.initiate_connection()
        self._write_to_transport()
class H2ClientFactory(_H2ClientFactory):
    """Client factory that produces this module's ``H2ClientProtocol``."""

    def buildProtocol(self, addr) -> H2ClientProtocol:
        """Create a protocol instance for a new connection.

        Overridden so that the ``H2ClientProtocol`` name resolves to the
        subclass defined in this module. ``addr`` is not used.
        """
        protocol = H2ClientProtocol(
            self.uri,
            self.settings,
            self.conn_lost_deferred,
        )
        return protocol
class H2ConnectionPool(_H2ConnectionPool):
    """Connection pool that connects using this module's ``H2ClientFactory``."""

    def _new_connection(self, key: Tuple, uri: URI, endpoint: HostnameEndpoint) -> Deferred:
        """Start a connection for ``key`` and return a Deferred waiting on it.

        Overridden so that the ``H2ClientFactory`` name resolves to the
        subclass defined in this module.

        A fresh queue of pending requests is registered under ``key`` and the
        returned Deferred is appended to it.
        NOTE(review): presumably the inherited ``put_connection`` fires the
        queued Deferreds once the connection is up - confirm against the parent.
        """
        self._pending_requests[key] = deque()

        # Invoke self._remove_connection(<result>, key) when this Deferred fires.
        on_connection_lost = Deferred()
        on_connection_lost.addCallback(self._remove_connection, key)

        client_factory = H2ClientFactory(uri, self.settings, on_connection_lost)
        endpoint.connect(client_factory).addCallback(self.put_connection, key)

        # Queue the caller's Deferred only after the connect attempt has started.
        waiter = Deferred()
        self._pending_requests[key].append(waiter)
        return waiter
class ScrapyTunnelingH2OverH1Agent(H2Agent):
    """An agent that uses a L{TunnelingTCP4ClientEndpoint} to make HTTP/2
    downloads through a proxy.

    It subclasses the plain agent rather than a proxy agent: once the tunnel
    is open the proxy is transparent to the client, so the agent should
    behave as if no proxy were involved.
    """

    def __init__(
        self,
        reactor: ReactorBase,
        proxy_auth: Optional[str],
        proxy_uri: URI,
        pool: H2ConnectionPool,
        context_factory: BrowserLikePolicyForHTTPS = BrowserLikePolicyForHTTPS(),
        connect_timeout: Optional[float] = None,
        bind_address: Optional[bytes] = None,
    ) -> None:
        """Initialise the parent agent and remember the proxy details.

        ``proxy_uri`` and ``proxy_auth`` are stored for use when building
        endpoints and pool keys; all remaining arguments are passed to the
        parent constructor unchanged.
        """
        super().__init__(
            reactor=reactor,
            pool=pool,
            context_factory=context_factory,
            connect_timeout=connect_timeout,
            bind_address=bind_address,
        )
        self._proxy_auth = proxy_auth
        self._proxy_uri = proxy_uri

    def get_endpoint(self, uri: URI):
        """Return a tunneling endpoint to ``uri`` via the configured proxy."""
        proxy = self._proxy_uri
        proxy_conf = (proxy.host, proxy.port, self._proxy_auth)
        # Timeout and bind address are read back from the endpoint factory.
        factory = self.endpoint_factory
        return TunnelingTCP4ClientEndpoint(
            reactor=self._reactor,
            host=uri.host,
            port=uri.port,
            proxyConf=proxy_conf,
            contextFactory=self._context_factory,
            timeout=factory._connectTimeout,
            bindAddress=factory._bindAddress,
        )

    def get_key(self, uri: URI) -> Tuple:
        """Return the pool key: the proxy host/port plus the destination
        host/port, so every proxy/destination pair gets its own connection.
        """
        proxy = self._proxy_uri
        return ("http-proxy", proxy.host, proxy.port, uri.host, uri.port)
class H2DownloadHandler(_H2DownloadHandler):
    """Download handler wired to this module's pool and agent classes."""

    def __init__(self, settings: Settings, crawler: Optional[Crawler] = None):
        """Store the crawler and create the connection pool and context factory.

        The parent constructor is not called; here the ``H2ConnectionPool``
        name resolves to the subclass defined in this module.
        """
        self._crawler = crawler

        # Imported here rather than at module level.
        from twisted.internet import reactor

        self._pool = H2ConnectionPool(reactor, settings)
        self._context_factory = load_context_factory_from_settings(settings, crawler)

    def download_request(self, request: Request, spider: Spider) -> Deferred:
        """Download ``request`` through a newly created ``ScrapyH2Agent``.

        Returns whatever the agent's ``download_request`` returns.
        """
        return ScrapyH2Agent(
            context_factory=self._context_factory,
            pool=self._pool,
            crawler=self._crawler,
        ).download_request(request, spider)
class ScrapyH2Agent(_ScrapyH2Agent):
    """Agent selector that adds CONNECT tunneling for proxied HTTPS requests."""

    # Agent class used for proxied https requests that need a CONNECT tunnel.
    _TunnelingH1OverH2Agent = ScrapyTunnelingH2OverH1Agent

    def __init__(
        self, context_factory,
        pool: H2ConnectionPool,
        connect_timeout: int = 10,
        bind_address: Optional[bytes] = None,
        crawler: Optional[Crawler] = None,
    ) -> None:
        """Forward every argument to the parent constructor unchanged."""
        super().__init__(context_factory, pool, connect_timeout, bind_address, crawler)

    def _get_agent(self, request: Request, timeout: Optional[float]) -> H2Agent:
        """Select the agent that should perform ``request``.

        A tunneling agent is returned when the request has a proxy, the
        request URL scheme is https and the proxy parameters do not contain
        ``noconnect``. Every other case is delegated to the parent class.
        """
        from twisted.internet import reactor

        bind_address = request.meta.get('bindaddress') or self._bind_address
        proxy = request.meta.get('proxy')
        if proxy:
            # Only the trailing element of the parsed proxy URL is needed.
            _scheme, _netloc, _host, _port, proxy_params = _parse(proxy)
            target_scheme = _parse(request.url)[0]
            skip_tunnel = b'noconnect' in proxy_params
            if target_scheme == b'https' and not skip_tunnel:
                # The case this module exists for: https through a CONNECT tunnel.
                proxy_auth = request.headers.get(b'Proxy-Authorization', None)
                proxy_uri = URI.fromBytes(to_bytes(proxy, encoding='ascii'))
                return self._TunnelingH1OverH2Agent(
                    reactor=reactor,
                    proxy_auth=proxy_auth,
                    context_factory=self._context_factory,
                    proxy_uri=proxy_uri,
                    connect_timeout=timeout,
                    bind_address=bind_address,
                    pool=self._pool,
                )
        # Everything else is handled by the regular parent agent selection.
        return super()._get_agent(request, timeout)
@scottw
Copy link

scottw commented Feb 3, 2022

This is just what I'm looking for! How would I set up Scrapy so that HTTP/1.1 requests can work at the same time? I.e., I have a site that serves some pages via HTTP/1.1 and other pages via HTTP/2. Here is the error I get:

2022-02-03 11:18:59 [scrapy.core.scraper] ERROR: Error downloading <GET https://site>
Traceback (most recent call last):
  File "/lib/python3.8/site-packages/scrapy/core/downloader/middleware.py", line 44, in process_request
    return (yield download_func(request=request, spider=spider))
twisted.web._newclient.ResponseFailed: [InactiveStreamClosed(<GET https://site>), <twisted.python.failure.Failure twisted.internet.error.ConnectionDone: Connection was closed cleanly.>]

If I disable the scrapy_h2_proxy download handler, this particular page loads fine.

@pozernishku
Copy link

pozernishku commented Jul 5, 2022

@wvengen Is it possible to adjust this h2 handler to rotate proxy IPs (persistent=False), similar to:

from scrapy.core.downloader.handlers.http import HTTPDownloadHandler
from twisted.web.client import HTTPConnectionPool


class RotatingProxiesDownloadHandler(HTTPDownloadHandler):
    """Solves the problem related to https proxies rotation.
    https://github.com/scrapy/scrapy/issues/1807#issuecomment-199099063
    """

    def __init__(self, settings, crawler):
        super().__init__(settings, crawler)
        from twisted.internet import reactor

        self._pool = HTTPConnectionPool(reactor, persistent=False)

Thank you!

@bezkos
Copy link

bezkos commented Jul 19, 2022

@wvengen Is it possible to adjust this h2 handler to close the proxy connection after each h2 request has finished?
My problem is that h2 doesn't support the "Connection: close" header, so even if I pass this specific header, the proxy doesn't close the connection and I don't get an IP refresh.

@pozernishku Did you find any solution to your problem? I have a similar one with "Connection: close".

@pozernishku
Copy link

pozernishku commented Jul 21, 2022

@bezkos no

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment