# ProxyMiddleware -- gist by @jerryan999, created July 26, 2018
import logging
import random
import re
from urllib.parse import urljoin, urlsplit

import pymongo
from scrapy import Request, signals
from scrapy.core.downloader.handlers.http11 import TunnelError
from scrapy.exceptions import IgnoreRequest
from scrapy.utils.response import response_status_message
from twisted.internet import task
from twisted.internet.error import (ConnectBindError, ConnectError,
                                    ConnectionRefusedError, TCPTimedOutError,
                                    TimeoutError)
from twisted.web._newclient import ResponseNeverReceived
from w3lib.url import safe_url_string

# `Agent` (the per-proxy bookkeeping helper) is not included in this gist;
# an illustrative sketch is given at the bottom of the file.

logger = logging.getLogger(__name__)


class ProxyMiddleware(object):
    """Customized proxy middleware.

    Regardless of success or failure, a different proxy is picked for
    every request.
    """

    # Switch to another proxy instead of handing the request to the
    # RetryMiddleware when one of these connection errors occurs.
    DONT_RETRY_ERRORS = (TimeoutError, ConnectionRefusedError, TCPTimedOutError,
                         ResponseNeverReceived, ConnectError, ConnectBindError,
                         TunnelError)
    def __init__(self, settings):
        config = settings.get('CONFIG')
        try:
            if (config is not None) and ('mongo_proxy' in config):
                self.client = pymongo.MongoClient(config['mongo_proxy']['hosts'])
        except Exception:
            logger.error("Wrong configuration, please check the database config file")
            raise
        self.db = self.client[config['mongo_proxy']['database']]
        self.mongo_collection = self.db[config['mongo_proxy']['collection']]

        # Pool of Agent objects, one per proxy.
        self.agent_list = []

        self.max_pool_size = settings.get('MAX_POOL_SIZE', 300)
        self.min_pool_size = settings.get('MIN_POOL_SIZE', 10)
        self.pool_waiting = settings.get('POOL_WAITING', 30)
        self.better_percent = settings.get('BETTER_PERCENT', 0.50)
        self.better_max_count = settings.get('BETTER_MAX_COUNT', 120)
        self.maintaining_interval = settings.get('MAINTAIN_INTERVAL', 360)
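    # The 'CONFIG' setting is assumed (it is not shown in this gist) to carry
    # the MongoDB connection details in roughly this shape -- the key names
    # match the lookups above, the values are only illustrative:
    #
    #   CONFIG = {
    #       'mongo_proxy': {
    #           'hosts': 'mongodb://localhost:27017',
    #           'database': 'proxies',
    #           'collection': 'proxy_pool',
    #       },
    #   }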
    @classmethod
    def from_crawler(cls, crawler):
        settings = crawler.settings
        o = cls(settings)
        o.crawler = crawler
        crawler.signals.connect(o.spider_opened, signal=signals.spider_opened)
        crawler.signals.connect(o.spider_closed, signal=signals.spider_closed)
        return o
    def readProxyfile(self):
        logger.debug("Fetching fresh proxies from MongoDB")
        # Each document's _id holds the proxy; purchased proxies start out
        # with a strong track record.
        for document in self.mongo_collection.find():
            if document.get('label') == "purchased":
                agent = Agent(document['_id'], success=1000, percentage=1)
            else:
                agent = Agent(document['_id'])
            if agent not in self.agent_list:
                self.agent_list.append(agent)
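    # A proxy document is assumed to look roughly like this; only `_id` and
    # the optional `label` field are read above, and the URL is illustrative:
    #
    #   {"_id": "http://user:pass@203.0.113.5:8080", "label": "purchased"}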
    def maintaining_agent(self):
        """Periodic pool maintenance: drop invalid agents, trim the pool when
        it grows too large, then top it up from MongoDB.
        """
        # Drop agents that have been marked invalid.
        self.agent_list = list(filter(lambda x: x.is_valid(), self.agent_list))
        # Keep the pool from growing without bound: keep only the proxies
        # with the highest success percentage.
        max_proxy_size = self.max_pool_size
        if len(self.agent_list) > max_proxy_size:
            logger.debug("Proxy list is too big, cutting off the low-scoring part")
            sortedagentlist = sorted(self.agent_list, key=lambda i: i.percentage)
            self.agent_list = sortedagentlist[len(self.agent_list) - max_proxy_size:]
        # Add more proxies into the pool.
        self.readProxyfile()
    def get_proxy_slot(self, proxy):
        """
        Return the downloader slot for a proxy.

        By default it doesn't take the port into account, i.e. all proxies
        with the same hostname / IP address share the same slot.
        """
        return urlsplit(proxy).hostname
    def process_request(self, request, spider):
        """Attach a randomly chosen proxy agent to every outgoing request."""
        if spider.name == "ZillowSpider" and "zillow" not in request.url[:22]:
            raise IgnoreRequest
        if request.url == "https://www.zillow.com:443/homes/":
            raise IgnoreRequest
        request.meta['agent'] = random.choice(
            list(filter(lambda x: x.is_valid(), self.agent_list)))
        request.meta['proxy'] = request.meta['agent'].proxy
        request.meta['download_slot'] = self.get_proxy_slot(request.meta['proxy'])
        logger.debug("Request %(request)s using proxy:%(proxy)s",
                     {'request': request, 'proxy': request.meta['proxy']})
    def _new_request_from_response(self, request):
        """Return a copy of the request that bypasses the duplicates filter."""
        new_request = request.copy()
        new_request.dont_filter = True
        return new_request
    def process_exception(self, request, exception, spider):
        """Handle connection errors: penalize the agent and retry the request,
        which will be sent through a different proxy.
        """
        agent = request.meta.get('agent')
        # Any exception costs the agent three strikes.
        for i in range(3):
            agent.weaken()
        if isinstance(exception, self.DONT_RETRY_ERRORS):
            logger.debug("Normal exception happened proxy:{} for processing {}".format(
                request.meta['proxy'], request.url))
            agent.weaken()
            return self._new_request_from_response(request)
    def spider_opened(self, spider):
        self.task = task.LoopingCall(self.maintaining_agent)
        # Run pool maintenance every MAINTAIN_INTERVAL seconds (360 by default).
        self.task.start(self.maintaining_interval)

    def spider_closed(self, spider, reason):
        if self.task and self.task.running:
            self.task.stop()
        self.client.close()
    def process_response(self, request, response, spider):
        agent = request.meta.get('agent')
        reason = response_status_message(response.status)
        # Dispatch to the site-specific handler for this spider.
        proxy_func = self.mapping_proxy(spider)
        if proxy_func:
            return proxy_func(request, response, agent, reason)
        # No handler registered for this spider: pass the response through.
        return response
    def mapping_proxy(self, spider):
        # airbnb_proxy and craig_proxy are referenced here but not included
        # in this gist; only zillow_proxy is shown below.
        if spider.name == 'AirbnbSpider':
            return self.airbnb_proxy
        elif spider.name == 'ZillowSpider':
            return self.zillow_proxy
        elif spider.name == 'CraigSpider':
            return self.craig_proxy
    def zillow_proxy(self, request, response, agent, reason):
        """Check the response status and other validation hints to decide
        whether to keep this proxy or change to another one.
        """
        if "zillow.service.search.SearchURL" in response.url:
            raise IgnoreRequest
        if response.status == 200:
            if response.body:  # sometimes an empty page comes back with a 200 code
                logger.debug("Good proxy:{} for processing {}".format(
                    request.meta['proxy'], response))
                if "AuthRequired" in response.url:
                    logger.debug("Status 200 on an AuthRequired page, so we ignore it:{}".format(request))
                    raise IgnoreRequest
                elif "captcha" in response.url:
                    logger.debug("Proxy:{} meets captcha when processing {}".format(
                        request.meta['proxy'], response))
                    agent.weaken()
                    if "zpid" in response.url:
                        # Recover the original listing URL from the captcha
                        # redirect and request it again.
                        url_pa = re.search('url=(.+zpid)', response.url)
                        if url_pa:
                            url = "https://www.zillow.com" + url_pa.group(1).replace('%2f', '/')
                            return Request(url=url, dont_filter=True,
                                           callback=self.crawler.spider.parse_apt_or_building)
                        else:
                            raise IgnoreRequest
                else:
                    agent.stronger()
            else:
                logger.debug("Fake proxy:{} for processing {}".format(
                    request.meta['proxy'], response))
                agent.weaken()
                return self._new_request_from_response(request)
            return response
        elif response.status in [302, 307]:
            # e.g. Redirecting (302) to <GET https://www.zillow.com/captcha/?dest=qiMbUzSCa1MGIlhrB-2stg> from <GET https://www.zillow.com/
            if response.headers.get('Location'):
                location = safe_url_string(response.headers['Location'])
                redirected_url = urljoin(request.url, location)
                if b'captcha' in response.headers[b'Location']:
                    logger.debug("Redirecting (302) to a captcha url, so we make a new request for:{}".format(request))
                    for k in range(2):
                        agent.weaken()
                    return self._new_request_from_response(request)
                if b"AuthRequired" in response.headers[b'Location']:
                    logger.debug("Redirecting (302) to an AuthRequired page, so we ignore it:{}".format(request))
                    agent.stronger()
                    raise IgnoreRequest
                return response
            return response
        elif response.status == 403:
            if 'zillow.com' in response.url:
                agent.set_invalid()
                logger.info("Proxy: {} met {}".format(agent.proxy, reason))
                return self._new_request_from_response(request)
            else:
                raise IgnoreRequest
        elif response.status == 404:
            if "alogin" in response.url:
                raise IgnoreRequest
        return response
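# The `Agent` helper used above is not part of this gist. The sketch below is
# only an illustration of the interface the middleware relies on (proxy,
# percentage, is_valid, weaken, stronger, set_invalid, and equality on the
# proxy URL); the scoring thresholds are assumptions, not the original
# implementation.
class Agent(object):
    def __init__(self, proxy, success=0, fail=0, percentage=0.0):
        self.proxy = proxy            # proxy URL, e.g. "http://host:port"
        self.success = success        # number of successful responses
        self.fail = fail              # number of failures / penalties
        self.percentage = percentage  # success ratio used for ranking
        self.valid = True

    def _update(self):
        total = self.success + self.fail
        self.percentage = self.success / total if total else 0.0

    def stronger(self):
        """Reward the proxy after a good response."""
        self.success += 1
        self._update()

    def weaken(self):
        """Penalize the proxy; too many failures invalidate it."""
        self.fail += 1
        self._update()
        if self.fail > 20 and self.percentage < 0.1:  # assumed threshold
            self.valid = False

    def set_invalid(self):
        self.valid = False

    def is_valid(self):
        return self.valid

    def __eq__(self, other):
        return isinstance(other, Agent) and self.proxy == other.proxy

    def __hash__(self):
        return hash(self.proxy)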
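# To use this middleware, it would be registered in the project's settings.py
# roughly as below. The module path and the priority 543 are only examples;
# the setting names match what __init__ reads above:
#
#   DOWNLOADER_MIDDLEWARES = {
#       'myproject.middlewares.ProxyMiddleware': 543,
#   }
#   MAX_POOL_SIZE = 300
#   MIN_POOL_SIZE = 10
#   POOL_WAITING = 30
#   BETTER_PERCENT = 0.50
#   BETTER_MAX_COUNT = 120
#   MAINTAIN_INTERVAL = 360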