Skip to content

Instantly share code, notes, and snippets.

@oiwn
Forked from anonymous/gist:4584171
Created January 21, 2013 08:31
Show Gist options
  • Save oiwn/4584571 to your computer and use it in GitHub Desktop.
Save oiwn/4584571 to your computer and use it in GitHub Desktop.
# -*- coding: utf-8 -*-
import os
import pdb
#import random
from scrapy import log, signals
from scrapy.stats import stats
from scrapy.xlib.pydispatch import dispatcher
from scrapy.exceptions import IgnoreRequest
from scrapy.conf import settings
from scrapy.utils.job import job_dir
import hashlib
import urllib2
from scrapy.utils.url import parse_url
from scrapy.selector import HtmlXPathSelector
import urlparse
import random
class ProxyMiddleware(object):
    """Scrapy downloader middleware that sends requests through random proxies.

    Proxy addresses are loaded once, at construction time, from a text file
    with one address per line.  Addresses are stored without a scheme;
    ``http://`` is prepended whenever one is written to ``request.meta``.

    ``request.meta`` keys read or written by this middleware:

    * ``_do_not_use_proxy`` -- when present (not ``None``) the request is
      left untouched by :meth:`process_request`.
    * ``proxy`` -- set to ``'http://' + <address>``.
    * ``retry_times`` -- when truthy, the request is treated as a retry: the
      counter is reset to 0 and the extra keys below are set.
    * ``_repeated`` -- set to ``True`` on requests re-issued by this class.
    * ``cookiejar`` -- set to the bare proxy address on retried requests.
    """

    # Class-level default; every instance replaces it with its own list.
    PROXY_LIST = []
    # Not referenced by any method of this class; kept for compatibility.
    _last = -1

    def __init__(self, proxy_file='proxy.txt'):
        """Load and shuffle the proxy list.

        :param proxy_file: path of the file holding one proxy address per
            line.  Defaults to ``proxy.txt`` relative to the current working
            directory.  Blank lines are ignored.
        :raises IOError: if the file cannot be opened.
        """
        # ``with`` guarantees the handle is closed even if reading fails.
        with open(proxy_file, "r") as handle:
            stripped = (line.strip() for line in handle)
            # Skip blank lines: an empty entry would later become the
            # meaningless proxy URL 'http://'.
            proxies = [entry for entry in stripped if entry]
        random.shuffle(proxies)
        self.PROXY_LIST = proxies

    def _get_proxy(self):
        """Return one randomly chosen proxy address (without scheme).

        :raises IndexError: if the proxy list is empty.
        """
        return random.choice(self.PROXY_LIST)

    def process_response(self, request, response, spider):
        """Re-issue a proxied request through another proxy on HTTP 503.

        :returns: ``response`` unchanged when the request did not use a proxy
            or the status is not 503; otherwise a copy of ``request`` carrying
            a newly chosen proxy.
        """
        proxy = request.meta.get('proxy')
        if proxy is None:
            return response

        reason = None
        if response.status == 503:
            reason = 'http code'
        if not reason:
            return response

        spider.log('proxy %s has been banned by %s' % (proxy, reason), log.ERROR)

        # Retry the most recent entry of ``redirect_urls`` when there is one;
        # otherwise fall back to the request's own URL.  The key is absent
        # when no redirect happened, which previously raised AttributeError.
        redirect_urls = request.meta.get('redirect_urls')
        url = redirect_urls.pop() if redirect_urls else request.url

        request = request.replace(url=url)
        request.meta['proxy'] = 'http://' + self._get_proxy()
        request.meta['_repeated'] = True
        # The same URL has already been requested once, so it must bypass
        # the duplicate filter.
        request.dont_filter = True
        return request

    def process_request(self, request, spider):
        """Attach a random proxy to ``request`` unless it opted out.

        Retried requests (truthy ``retry_times``) additionally get their
        retry counter reset, a cookie jar keyed by the chosen proxy, and
        ``dont_filter`` enabled.

        :returns: ``None`` in every case.
        """
        meta = request.meta
        if meta.get('_do_not_use_proxy') is not None:
            return None

        # Choose the proxy exactly once so that, on retries, the cookie jar
        # key and the proxy actually used always refer to the same address.
        proxy = self._get_proxy()
        meta['proxy'] = 'http://' + proxy

        if meta.get('retry_times'):
            meta['retry_times'] = 0
            meta['_repeated'] = True
            meta['cookiejar'] = proxy
            request.dont_filter = True
        return None
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment