Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Save IAlwaysBeCoding/438464eec197f546ad1b707dba1f7551 to your computer and use it in GitHub Desktop.
Save IAlwaysBeCoding/438464eec197f546ad1b707dba1f7551 to your computer and use it in GitHub Desktop.
Scrapy downloader middleware class for handling IPv6-only hosts.
# -*- coding: utf8 -*-
from __future__ import absolute_import, division, print_function
import socket
from urlparse import urlparse
class DnsResolverMiddleware(object):
    """
    Downloader middleware that resolves host names to IP addresses itself.

    It resolves a domain to an IPv4 / IPv6 address.
    Previously it was designed for IPv6-only hosts.

    Algorithm:
    1) Replace the host name with its address in the request URL.
    2) Form a new request (the original name is kept in the Host header).
    3) Let the Scrapy engine perform this request.
    4) Get a response.
    5) Replace the address with the host name in the response URL.
    6) Return the response.

    Lookups are not cached: every request that arrives without ``domain``
    and ``address`` in its meta triggers a new ``socket.getaddrinfo`` call,
    so it may be quite slow.

    TODO:
    * Add caching [?].
    """

    # Bidirectional lookup table: every host maps to the list of addresses
    # it was resolved to, and every address maps back to its hosts.
    # It stays ``None`` on the class and is created per instance on first
    # use, so instances never share one mutable dict by accident.
    dns_dict = None

    def process_request(self, request, spider):
        """
        Replace the host name in the request URL with its IP address.

        The original host name is sent in the ``Host`` header, and an
        address-based ``Referer`` header is converted back to its domain.

        :param request: the request being processed;
        :param spider: dummy parameter in this case;
        :return: None (request already resolved, or its URL has no host)
                 or a new request whose URL points at the address.
        """
        meta = getattr(request, 'meta', None) or dict()
        self._ensure_dns_dict()

        # To resist infinite loop: the request we return comes back here.
        if meta.get('resolved_request'):
            return None

        domain_url = request.url

        # Reuse domain and address when the caller supplied both of them.
        domain = meta.get('domain', str())
        address = meta.get('address', str())
        if not domain or not address:
            domain = self.parse_host(domain_url)
            if not domain:
                # Nothing to resolve (e.g. a URL without a network location).
                return None
            address = self.compute_address(domain)
        self._remember(domain, address)

        # Get a new url with the address instead of the domain.
        address_url = self._rewrite_host(domain_url, domain, address)

        headers = getattr(request, 'headers', None) or dict()
        # The incoming Referer may carry an address; send the domain instead.
        address_referer = headers.get('Referer', str())
        domain_referer = self.convert_to_domain(address_referer)

        # The Host header lets the server pick the right virtual host even
        # though the URL now contains a bare address.
        new_headers = dict(headers, Host=domain)
        if domain_referer:
            new_headers['Referer'] = domain_referer

        new_meta = dict(
            meta,
            # Mark this request as processed to resist infinite loop.
            resolved_request=True,
            # Auxiliary data: saves repeated computations and helps debugging.
            dns_dict=self.dns_dict,
            domain=domain,
            address=address,
            url=dict(
                domain=domain_url,
                address=address_url,
            ),
            referer=dict(
                domain=domain_referer,
                address=address_referer,
            ),
        )
        return request.replace(
            url=address_url,
            headers=new_headers,
            meta=new_meta,
        )

    def process_response(self, request, response, spider):
        """
        Put the domain name back into the response (and request) URL.

        1) It gets `domain` and `address` from request's meta.
        2) It replaces `address` with `domain` in the response's url.
        3) It forms a new request and a new response carrying that url.

        :param request: the request that originated the response;
        :param response: the response being processed;
        :param spider: dummy parameter in this case;
        :return: the response; a rewritten copy unless the request was
                 already marked with ``resolved_response``.
        """
        meta = getattr(request, 'meta', None) or dict()
        if meta.get('resolved_response'):
            # Already handled; hand the response back untouched.
            return response

        # Prefer the exact pair stored by process_request: the table alone
        # is ambiguous when several domains share one address.
        new_url = self._rewrite_host(
            response.url,
            meta.get('address'),
            meta.get('domain'),
        )
        new_meta = dict(
            meta,
            # Mark this response as processed.
            resolved_response=True,
        )
        new_request = request.replace(
            url=new_url,
            meta=new_meta,
        )
        # ``replace`` builds a new object, so its result must be returned.
        return response.replace(
            url=new_url,
            request=new_request,
        )

    def convert_to_address(self, url):
        """
        Replace a known domain with its address in the host part of the url.

        :param str url: original url;
        :return: a new url, or the same url when its host is unknown.
        :rtype: str
        """
        return self._translate_host(url)

    def convert_to_domain(self, url):
        """
        Replace a known address with its domain in the host part of the url.

        :param str url: original url;
        :return: a new url, or the same url when its host is unknown.
        :rtype: str
        """
        return self._translate_host(url)

    def parse_host(self, url):
        """
        Return the network location of the url.

        This is ``urlparse(url).netloc``, so a port or user info present
        in the url is part of the returned value.

        :param str url: source uri with IP or domain
        :return: host of given url
        :rtype: str
        """
        return urlparse(url).netloc

    def compute_address(self, host_name):
        """
        Return a http-compatible IP-address for the given network location.

        User info and port, when present, are kept around the resolved
        address so the result can replace ``host_name`` inside a url.
        This function is a stub for caching.

        :param str host_name: symbolic name of host, optionally with
                              ``user@`` prefix and ``:port`` suffix;
        :return: string with IP-address (plus the same prefix and suffix).
        :rtype: str
        """
        prefix, host, port = self._split_netloc(host_name)
        return prefix + self._compute_address(host) + port

    def _compute_address(self, host_name):
        """
        Return a http-compatible IP-address of the given host.

        For IPv6 it wraps the address into square brackets.
        When the name is empty or cannot be resolved, it is returned as is.

        :param str host_name: symbolic name of host;
        :return: string with IP-address.
        :rtype: str
        """
        lookup_name = host_name
        if lookup_name.startswith('[') and lookup_name.endswith(']'):
            # getaddrinfo expects an IPv6 literal without url brackets.
            lookup_name = lookup_name[1:-1]
        if not lookup_name:
            return host_name
        try:
            addr_list = socket.getaddrinfo(lookup_name, 0)
        except socket.gaierror:
            # Same fallback as for an empty result: keep the name untouched.
            return host_name
        for family, _socktype, _proto, _canonname, sockaddr in addr_list:
            if family == socket.AF_INET6:
                return '[{address}]'.format(address=sockaddr[0])
            if family == socket.AF_INET:
                return sockaddr[0]
        return host_name

    def _ensure_dns_dict(self):
        """Return the lookup table, creating it on first use."""
        if self.dns_dict is None:
            self.dns_dict = dict()
        return self.dns_dict

    def _remember(self, domain, address):
        """Record the domain <-> address pair in both directions."""
        table = self._ensure_dns_dict()
        for key, value in ((domain, address), (address, domain)):
            known = table.setdefault(key, [])
            if value not in known:
                known.append(value)

    def _translate_host(self, url):
        """Swap the host of the url for its first counterpart in the table."""
        host = self.parse_host(url)
        if not host:
            return url
        counterparts = self._ensure_dns_dict().get(host)
        if not counterparts:
            return url
        return self._swap_netloc(url, host, counterparts[0])

    def _rewrite_host(self, url, old_host, new_host):
        """Swap old_host for new_host if it is the url's host, else use the table."""
        if old_host and new_host and self.parse_host(url) == old_host:
            return self._swap_netloc(url, old_host, new_host)
        return self._translate_host(url)

    @staticmethod
    def _swap_netloc(url, old, new):
        """Replace only the network location ``old`` of the url with ``new``."""
        # The network location directly follows the first '//' of a url, so
        # anchoring on it leaves equal text in the path or query untouched.
        return url.replace('//' + old, '//' + new, 1)

    @staticmethod
    def _split_netloc(netloc):
        """Split a network location into (user info prefix, host, port suffix)."""
        userinfo, at_sign, host_port = netloc.rpartition('@')
        prefix = userinfo + at_sign
        if host_port.startswith('['):
            # Bracketed IPv6 literal: a port can only follow the bracket.
            closing = host_port.find(']')
            if closing != -1:
                return prefix, host_port[:closing + 1], host_port[closing + 1:]
            return prefix, host_port, str()
        host, colon, port = host_port.rpartition(':')
        # A second colon means a bare IPv6 address, not a ``host:port`` pair.
        if colon and port.isdigit() and ':' not in host:
            return prefix, host, colon + port
        return prefix, host_port, str()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment