-
-
Save pawelmhm/176a4d01aea93c65bd64155c761fcc7d to your computer and use it in GitHub Desktop.
# -*- coding: utf-8 -*- | |
from __future__ import absolute_import, division, unicode_literals | |
import logging | |
from collections import OrderedDict | |
from time import time | |
from urllib.parse import urldefrag | |
from scrapy.core.downloader import Downloader | |
from scrapy.core.downloader.handlers.http11 import HTTP11DownloadHandler, \ | |
ScrapyAgent, _RequestBodyProducer | |
from scrapy.utils.python import to_bytes | |
from twisted.internet import reactor | |
from twisted.web.http_headers import Headers as TxHeaders | |
logger = logging.getLogger(__name__) | |
# To use that you need to do following. | |
# 1. Save it in some files, for example my_scrapy_project/ordered_downloader.py | |
# 2. Define this downloader in settings.py of your Scrapy project or in spider. | |
# For example in settings | |
# DOWNLOAD_HANDLERS_BASE = { | |
# 'data': 'scrapy.core.downloader.handlers.datauri.DataURIDownloadHandler', | |
# 'file': 'scrapy.core.downloader.handlers.file.FileDownloadHandler', | |
# 'http': 'scrapy.core.downloader.handlers.http.HTTPDownloadHandler', | |
# Custom downloader to preserve header | |
# order. Add path where you store your downloader. | |
# 'https': 'my_project.ordered_downloader.HeaderOrderDownloader', | |
# 's3': 'scrapy.core.downloader.handlers.s3.S3DownloadHandler', | |
# 'ftp': 'scrapy.core.downloader.handlers.ftp.FTPDownloadHandler', | |
# } | |
class OrderedHeaders(TxHeaders):
    """Twisted ``Headers`` subclass that stores headers in an ``OrderedDict``
    and emits them in a defined order.

    Headers named in ``ordering`` are written first (in that order); any
    remaining headers follow in their original relative order.
    """

    def __init__(self, rawHeaders=None):
        # Define your preferred header order here (byte keys, as Twisted
        # stores header names as bytes).
        ordering = [b'Host', b'User-Agent', b'Accept-Encoding', b'Accept',
                    b'Connection', b'Cookie']
        self._rawHeaders = OrderedDict()
        if rawHeaders is not None:
            # First pass: set the explicitly ordered headers.
            for key in ordering:
                values = rawHeaders.get(key)
                if values is None:
                    # Header absent from this request — skip it. The
                    # original code fell through to setRawHeaders(key, None),
                    # which raises TypeError ("should be list").
                    continue
                if not isinstance(values, list):
                    # Scrapy's Headers.get() returns a single value;
                    # getlist() gives the list form setRawHeaders expects.
                    values = rawHeaders.getlist(key)
                self.setRawHeaders(key, values)
            # Second pass: append everything not covered by the ordering.
            for name, values in rawHeaders.items():
                if name not in ordering:
                    self.setRawHeaders(name, values)

    def setRawHeaders(self, name, values):
        """
        Copy pasted from Twisted, with only addition being adding move_to_end() method,
        called after setting header.
        """
        if not isinstance(values, list):
            raise TypeError("Header entry %r should be list but found "
                            "instance of %r instead" % (name, type(values)))
        name = self._encodeName(name)
        self._rawHeaders[name] = self._encodeValues(values)
        # Re-setting an existing header moves it to the end, preserving
        # insertion order as the transmission order.
        self._rawHeaders.move_to_end(name)
class ScrapyHeaderOrderAgent(ScrapyAgent):
    # Copy pasted from Scrapy's ScrapyAgent; the ONLY functional difference is
    # that request headers are wrapped in OrderedHeaders (see the
    # `headers = OrderedHeaders(request.headers)` line below) instead of the
    # plain Twisted Headers, so they are sent in a defined order.
    def download_request(self, request):
        """Issue *request* via a Twisted agent and return a Deferred that
        fires with the downloaded response.

        Per-request `download_timeout` from request.meta overrides the
        agent-level connect timeout.
        """
        timeout = request.meta.get('download_timeout') or self._connectTimeout
        agent = self._get_agent(request, timeout)
        # request details — strip the URL fragment, encode method/URL to bytes
        url = urldefrag(request.url)[0]
        method = to_bytes(request.method)
        headers = OrderedHeaders(request.headers)
        if isinstance(agent, self._TunnelingAgent):
            # The tunneling agent sends Proxy-Authorization during CONNECT;
            # it must not leak into the tunneled request itself.
            headers.removeHeader(b'Proxy-Authorization')
        if request.body:
            bodyproducer = _RequestBodyProducer(request.body)
        elif method == b'POST':
            # Setting Content-Length: 0 even for POST requests is not a
            # MUST per HTTP RFCs, but it's common behavior, and some
            # servers require this, otherwise returning HTTP 411 Length required
            #
            # RFC 7230#section-3.3.2:
            # "a Content-Length header field is normally sent in a POST
            # request even when the value is 0 (indicating an empty payload body)."
            #
            # Twisted < 17 will not add "Content-Length: 0" by itself;
            # Twisted >= 17 fixes this;
            # Using a producer with an empty-string sends `0` as Content-Length
            # for all versions of Twisted.
            bodyproducer = _RequestBodyProducer(b'')
        else:
            bodyproducer = None
        start_time = time()
        d = agent.request(method, to_bytes(url, encoding='ascii'), headers,
                          bodyproducer)
        # set download latency
        d.addCallback(self._cb_latency, request, start_time)
        # response body is ready to be consumed
        d.addCallback(self._cb_bodyready, request)
        d.addCallback(self._cb_bodydone, request, url)
        # check download timeout: cancel the deferred if it takes too long;
        # _cb_timeout (added via addBoth) cleans up / converts the failure.
        self._timeout_cl = reactor.callLater(timeout, d.cancel)
        d.addBoth(self._cb_timeout, request, url, timeout)
        return d
class HeaderOrderDownloader(HTTP11DownloadHandler):
    # Same as Scrapy's HTTP11DownloadHandler, except it delegates to
    # ScrapyHeaderOrderAgent so headers go out in a deterministic order.
    def download_request(self, request, spider):
        """Return a deferred for the HTTP download"""
        # Spider-level size limits take precedence over handler defaults.
        max_size = getattr(spider, 'download_maxsize', self._default_maxsize)
        warn_size = getattr(spider, 'download_warnsize', self._default_warnsize)
        ordered_agent = ScrapyHeaderOrderAgent(
            contextFactory=self._contextFactory,
            pool=self._pool,
            maxsize=max_size,
            warnsize=warn_size,
            fail_on_dataloss=self._fail_on_dataloss,
        )
        return ordered_agent.download_request(request)
Summary
To piggyback on top of this, I needed to avoid browser fingerprinting by ordering headers.
My goals were:
- Alphabetically sort headers, including
Content-Length
for POST requests;
- Make headers case-sensitive.
My Github repository is here. I hope this will be useful to you. It's very similar to this gist's strategy.
My reddit post
My stackoverflow post
Content-Length
header
As pawelmhm mentions, Content-Length
is added by Twisted. Specifically here for non-zero POST body and here for zero body responses. The breadcrumb trail is: ScrapyAgent
-> download_request
-> agent.request(...)
-> TunnelingAgent
-> Twisted: _requestWithEndpoint
-> Request._construct
-> _writeToBodyProducerContentLength
or _writeToEmptyBodyContentLength
.
These functions, _writeToBodyProducerContentLength
and _writeToEmptyBodyContentLength
call _writeHeaders
and do something like self._writeHeaders(transport, b"Content-Length: 0\r\n")
. As a result, Content-Length
is added to the top of your headers. If you try to include Content-Length
, something like this will happen and you will get a 400
error.
As a result, if you want to control Content-Length
, you must edit the Twisted code itself. See my repository README.md for instructions on that.
Case sensitive headers
This is more of a well-documented aspect. To do this, you modify the _caseMappings
attribute of the internal Twisted headers class. You are writing an override that tells Twisted not to apply its usual normalization to this header.
Let's say you have the header aAbBcCdD
, and you see it is getting sent as Aabbccdd
. You can go into your spider and before the request is yielded, add the code:
from twisted.web.http_headers import Headers as TwistedHeaders
# Preserve casing of headers
TwistedHeaders._caseMappings[b'aabbccdd'] = b'aAbBcCdD'
The key is made lowercase because twisted encodes the name into bytes via this function which calls .lower()
method.
Additionally resources for that can be found here:
Scrapy #2711
Developing
If you want to confirm if your headers are being sent as you expect, I'd recommend using Charles proxy or Fiddler. You can download Fiddler here. You can configure Fiddler for python applications by something like this. Essentially, you will end up sending your request to a localhost proxy by specifying in the meta attribute of the request, meta = {'proxy': '127.0.0.1:8866'}
.
Hello, is there any way to make this workable in 2023?