# scraper/middlewares.py
#
# Example of an automatically rotating proxy middleware for
# scrapy-rotating-proxies, using proxybroker to refresh the proxy list
#
import codecs
import logging
from subprocess import call

from rotating_proxies.expire import Proxies
from rotating_proxies.middlewares import RotatingProxyMiddleware
from scrapy import signals
from scrapy.exceptions import CloseSpider, NotConfigured

from scraper.settings import ROTATING_PROXY_LIST_PATH

logger = logging.getLogger(__name__)
class AutoRotatingProxyMiddleware(RotatingProxyMiddleware):

    def reanimate_proxies(self):
        """Overridden as a no-op so dead proxies are never reanimated."""

    @staticmethod
    def _refresh_proxy_list():
        """Call the external proxybroker script to obtain a new proxy list.

        Blocks until the subprocess has finished writing the file.
        """
        call(['python', 'scraper/proxy.py'])
    @classmethod
    def _read_proxy_path_file(cls, proxy_path, reset_proxies=False):
        """
        Read a text file containing the proxy list, re-obtaining the list
        if the file is empty. A forced update can be requested by passing
        reset_proxies=True.

        Note: if the refresh script keeps producing an empty file, this
        recurses indefinitely; a retry cap would bound that.
        """
        if reset_proxies:
            cls._refresh_proxy_list()
            return cls._read_proxy_path_file(proxy_path)
        with codecs.open(proxy_path, 'r', encoding='utf8') as f:
            proxy_list = [line.strip() for line in f if line.strip()]
        if not proxy_list:
            logger.error('Proxy list is empty! Refreshing list')
            cls._refresh_proxy_list()
            return cls._read_proxy_path_file(proxy_path)
        return proxy_list
    @classmethod
    def from_crawler(cls, crawler):
        s = crawler.settings
        proxy_path = s.get('ROTATING_PROXY_LIST_PATH', None)
        if proxy_path is not None:
            proxy_list = cls._read_proxy_path_file(proxy_path)
        else:
            proxy_list = s.getlist('ROTATING_PROXY_LIST')
        if not proxy_list:
            raise NotConfigured()
        mw = cls(
            proxy_list=proxy_list,
            logstats_interval=s.getfloat(
                'ROTATING_PROXY_LOGSTATS_INTERVAL', 30),
            stop_if_no_proxies=s.getbool('ROTATING_PROXY_CLOSE_SPIDER', False),
            max_proxies_to_try=s.getint('ROTATING_PROXY_PAGE_RETRY_TIMES', 5),
            backoff_base=s.getfloat('ROTATING_PROXY_BACKOFF_BASE', 300),
            backoff_cap=s.getfloat('ROTATING_PROXY_BACKOFF_CAP', 3600),
            crawler=crawler,
        )
        crawler.signals.connect(mw.engine_started,
                                signal=signals.engine_started)
        crawler.signals.connect(mw.engine_stopped,
                                signal=signals.engine_stopped)
        return mw
    def process_request(self, request, spider):
        if 'proxy' in request.meta and not request.meta.get('_rotating_proxy'):
            return
        proxy = self.proxies.get_random()
        if not proxy:
            if self.stop_if_no_proxies:
                raise CloseSpider("no_proxies")
            else:
                logger.warning("No proxies available; resetting proxy list")
                new_list = self._read_proxy_path_file(
                    ROTATING_PROXY_LIST_PATH, reset_proxies=True)
                backoff = self.proxies.backoff
                self.proxies = Proxies(
                    self.cleanup_proxy_list(new_list), backoff=backoff)
                proxy = self.proxies.get_random()
                if proxy is None:
                    logger.error("No proxies available even after a reset.")
                    raise CloseSpider("no_proxies_after_reset")
        request.meta['proxy'] = proxy
        request.meta['download_slot'] = self.get_proxy_slot(proxy)
        request.meta['_rotating_proxy'] = True
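
As a quick offline sanity check of the empty-file path above, a pytest sketch along these lines could be used (hypothetical: the module path scraper.middlewares and the stubbed _refresh_proxy_list are assumptions; tmp_path and monkeypatch are pytest's built-in fixtures):

# test_middlewares.py (hypothetical sketch, not part of the gist)
from scraper.middlewares import AutoRotatingProxyMiddleware


def test_empty_file_triggers_refresh(tmp_path, monkeypatch):
    proxy_file = tmp_path / 'proxylist.txt'
    proxy_file.write_text('')  # start with an empty proxy list

    def fake_refresh():
        # Stand-in for the real proxybroker subprocess call,
        # so the test stays offline.
        proxy_file.write_text('127.0.0.1:8080\n')

    monkeypatch.setattr(AutoRotatingProxyMiddleware, '_refresh_proxy_list',
                        staticmethod(fake_refresh))
    assert AutoRotatingProxyMiddleware._read_proxy_path_file(
        str(proxy_file)) == ['127.0.0.1:8080']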
# scraper/proxy.py
import asyncio
import logging
import sys

from proxybroker import Broker

try:
    from scraper.settings import PROXY_COUNT, PROXY_COUNTRIES
except ModuleNotFoundError:
    # Allow running the script directly from the project root.
    import os
    sys.path.append(os.path.abspath('.'))
    from scraper.settings import PROXY_COUNT, PROXY_COUNTRIES

logging.basicConfig(stream=sys.stdout, level=logging.INFO,
                    format='[%(levelname)s]: %(message)s')
async def save(proxies, filename):
    """Save proxies to a file, one host:port per line."""
    with open(filename, 'w') as f:
        while True:
            proxy = await proxies.get()
            if proxy is None:  # Broker.find signals completion with None
                break
            f.write(f'{proxy.host}:{proxy.port}\n')
def main():
    proxies = asyncio.Queue()
    broker = Broker(proxies)
    tasks = asyncio.gather(broker.find(
        types=['HTTP', 'HTTPS'], limit=PROXY_COUNT, countries=PROXY_COUNTRIES),
        save(proxies, filename='_input/proxylist.txt'))
    loop = asyncio.get_event_loop()
    loop.run_until_complete(tasks)
    logging.info('Proxies obtained')


if __name__ == '__main__':
    logging.info(f'Obtaining {PROXY_COUNT} proxies via proxybroker')
    logging.info('Script settings can be edited in scraper/settings.py')
    main()
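
On Python 3.7+ the same flow can also be driven by asyncio.run (a minimal sketch, not part of the original gist; it reuses save, PROXY_COUNT and PROXY_COUNTRIES from above, and assumes a proxybroker build that tolerates being constructed inside an already-running loop):

async def amain():
    # Queue and Broker are created inside the running loop on purpose,
    # since older asyncio code binds objects to the current loop.
    proxies = asyncio.Queue()
    broker = Broker(proxies)
    await asyncio.gather(
        broker.find(types=['HTTP', 'HTTPS'], limit=PROXY_COUNT,
                    countries=PROXY_COUNTRIES),
        save(proxies, filename='_input/proxylist.txt'),
    )

# asyncio.run(amain())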
# scraper/settings.py
import os

BOT_NAME = 'scraper'
SPIDER_MODULES = ['scraper.spiders']
NEWSPIDER_MODULE = 'scraper.spiders'

# Files and directories
BASE_DIR = os.getcwd()
INPUT_FOLDER = os.path.join(BASE_DIR, '_input')
OUTPUT_FOLDER = os.path.join(BASE_DIR, '_output')
APP_FOLDER = os.path.join(BASE_DIR, 'scraper')

# Rotating proxies
PROXY_COUNT = 20
PROXY_COUNTRIES = ['DE', 'CH', 'FR']
ROTATING_PROXY_LIST_PATH = os.path.join(INPUT_FOLDER, 'proxylist.txt')
ROTATING_PROXY_BACKOFF_BASE = 30
ROTATING_PROXY_PAGE_RETRY_TIMES = 20
ROTATING_PROXY_BACKOFF_CAP = 7200

# Crawl responsibly by identifying yourself (and your website) on the user-agent
USER_AGENT = 'Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9) Gecko/20080705 Firefox/3.0 Kapiko/3.0'

# Obey robots.txt rules
ROBOTSTXT_OBEY = True

# Configure maximum concurrent requests performed by Scrapy (default: 16)
CONCURRENT_REQUESTS = 16
DOWNLOAD_DELAY = 1.5
# CONCURRENT_REQUESTS_PER_DOMAIN = 1

DOWNLOADER_MIDDLEWARES = {
    'scraper.middlewares.AutoRotatingProxyMiddleware': 610,
    'rotating_proxies.middlewares.BanDetectionMiddleware': 620,
}
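
Ban detection can be customized beyond the BanDetectionMiddleware defaults: scrapy-rotating-proxies supports pointing ROTATING_PROXY_BAN_POLICY at your own policy class. A minimal sketch following the library's documented pattern (the module path scraper.policy and the captcha check are assumptions):

# scraper/policy.py (hypothetical)
from rotating_proxies.policy import BanDetectionPolicy


class MyBanPolicy(BanDetectionPolicy):
    def response_is_ban(self, request, response):
        # Keep the library defaults, but also treat captcha pages as bans.
        ban = super().response_is_ban(request, response)
        return ban or b'captcha' in response.body

# settings.py: ROTATING_PROXY_BAN_POLICY = 'scraper.policy.MyBanPolicy'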