#
# middleware.py (forked from vertigg/middleware.py)
# Example of an automatic rotating-proxy middleware for scrapy-rotating-proxies,
# using proxybroker to refresh the proxy list whenever it runs out.
#
import codecs
import logging
from subprocess import call

from rotating_proxies.expire import Proxies
from rotating_proxies.middlewares import RotatingProxyMiddleware
from scrapy import signals
from scrapy.exceptions import CloseSpider, NotConfigured

from scraper.settings import ROTATING_PROXY_LIST_PATH

logger = logging.getLogger(__name__)


class AutoRotatingProxyMiddleware(RotatingProxyMiddleware):

    def reanimate_proxies(self):
        """Overridden as a no-op so that dead proxies are never reanimated."""

    @staticmethod
    def _refresh_proxy_list():
        """Call the external script that obtains a new list of proxies."""
        call(['python', 'scraper/proxy.py'])

    @classmethod
    def _read_proxy_path_file(cls, proxy_path, reset_proxies=False):
        """
        Read the proxy list from a text file, re-obtaining the list if the
        file is empty. A refresh can be forced by passing reset_proxies=True.
        """
        if reset_proxies:
            cls._refresh_proxy_list()
            return cls._read_proxy_path_file(proxy_path)
        with codecs.open(proxy_path, 'r', encoding='utf8') as f:
            proxy_list = [line.strip() for line in f if line.strip()]
        if not proxy_list:
            logger.error('Proxy list is empty! Refreshing list')
            cls._refresh_proxy_list()
            return cls._read_proxy_path_file(proxy_path)
        return proxy_list

    @classmethod
    def from_crawler(cls, crawler):
        s = crawler.settings
        proxy_path = s.get('ROTATING_PROXY_LIST_PATH', None)
        if proxy_path is not None:
            proxy_list = cls._read_proxy_path_file(proxy_path)
        else:
            proxy_list = s.getlist('ROTATING_PROXY_LIST')
        if not proxy_list:
            raise NotConfigured()
        mw = cls(
            proxy_list=proxy_list,
            logstats_interval=s.getfloat('ROTATING_PROXY_LOGSTATS_INTERVAL', 30),
            stop_if_no_proxies=s.getbool('ROTATING_PROXY_CLOSE_SPIDER', False),
            max_proxies_to_try=s.getint('ROTATING_PROXY_PAGE_RETRY_TIMES', 5),
            backoff_base=s.getfloat('ROTATING_PROXY_BACKOFF_BASE', 300),
            backoff_cap=s.getfloat('ROTATING_PROXY_BACKOFF_CAP', 3600),
            crawler=crawler,
        )
        crawler.signals.connect(mw.engine_started,
                                signal=signals.engine_started)
        crawler.signals.connect(mw.engine_stopped,
                                signal=signals.engine_stopped)
        return mw

    def process_request(self, request, spider):
        # Leave requests alone if they already carry a manually assigned proxy.
        if 'proxy' in request.meta and not request.meta.get('_rotating_proxy'):
            return
        proxy = self.proxies.get_random()
        if not proxy:
            if self.stop_if_no_proxies:
                raise CloseSpider("no_proxies")
            logger.warning("No proxies available; resetting proxy list")
            new_list = self._read_proxy_path_file(
                ROTATING_PROXY_LIST_PATH, reset_proxies=True)
            backoff = self.proxies.backoff
            self.proxies = Proxies(
                self.cleanup_proxy_list(new_list), backoff=backoff)
            proxy = self.proxies.get_random()
            if proxy is None:
                logger.error("No proxies available even after a reset.")
                raise CloseSpider("no_proxies_after_reset")
        request.meta['proxy'] = proxy
        request.meta['download_slot'] = self.get_proxy_slot(proxy)
        request.meta['_rotating_proxy'] = True
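

# Usage sketch, not part of the gist: a minimal spider showing how requests
# interact with the middleware above. The spider name, URL, and proxy address
# are assumptions. Requests without an explicit proxy get one assigned in
# process_request; a request that already sets meta['proxy'] (and carries no
# '_rotating_proxy' flag) is left untouched, as the first check in
# process_request shows.
import scrapy


class ExampleSpider(scrapy.Spider):
    name = 'example'

    def start_requests(self):
        # Rotated request: the middleware picks a random proxy and also sets
        # meta['download_slot'] and meta['_rotating_proxy'].
        yield scrapy.Request('https://httpbin.org/ip', callback=self.parse)
        # Pinned request: an explicit proxy bypasses rotation entirely
        # (hypothetical address, shown only to illustrate the bypass).
        yield scrapy.Request('https://httpbin.org/ip', callback=self.parse,
                             dont_filter=True,
                             meta={'proxy': 'http://127.0.0.1:8080'})

    def parse(self, response):
        yield {'used_proxy': response.request.meta.get('proxy')}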
#
# proxy.py
# Obtains a fresh list of proxies via proxybroker and writes it to
# _input/proxylist.txt, the file read by the middleware above.
#
import asyncio
import logging
import sys

from proxybroker import Broker

try:
    from scraper.settings import PROXY_COUNT, PROXY_COUNTRIES
except ModuleNotFoundError:
    # Allow running the script directly from the project root.
    import os
    sys.path.append(os.path.abspath('.'))
    from scraper.settings import PROXY_COUNT, PROXY_COUNTRIES

logging.basicConfig(stream=sys.stdout, level=logging.INFO,
                    format='[%(levelname)s]: %(message)s')


async def save(proxies, filename):
    """Save proxies to a file, one host:port entry per line."""
    with open(filename, 'w') as f:
        while True:
            proxy = await proxies.get()
            if proxy is None:
                break
            f.write(f'{proxy.host}:{proxy.port}\n')


def main():
    proxies = asyncio.Queue()
    broker = Broker(proxies)
    tasks = asyncio.gather(
        broker.find(types=['HTTP', 'HTTPS'], limit=PROXY_COUNT,
                    countries=PROXY_COUNTRIES),
        save(proxies, filename='_input/proxylist.txt'))
    loop = asyncio.get_event_loop()
    loop.run_until_complete(tasks)
    logging.info('Proxies obtained')


if __name__ == '__main__':
    logging.info(f'Obtaining {PROXY_COUNT} proxies via proxybroker')
    logging.info('Script settings can be edited in scraper/settings.py')
    main()
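

# Quick-check sketch, not part of the gist (assumption: run from the project
# root after `python scraper/proxy.py` has finished, so _input/proxylist.txt
# exists). It only confirms that the file was populated with one host:port
# entry per line, which is the format the middleware expects.
from pathlib import Path

entries = [line.strip()
           for line in Path('_input/proxylist.txt').read_text(encoding='utf8').splitlines()
           if line.strip()]
print(f'{len(entries)} proxies written, sample: {entries[:3]}')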
#
# settings.py
# Scrapy project settings for the scraper project.
#
import os

BOT_NAME = 'scraper'

SPIDER_MODULES = ['scraper.spiders']
NEWSPIDER_MODULE = 'scraper.spiders'

# Files and directories
BASE_DIR = os.getcwd()
INPUT_FOLDER = os.path.join(BASE_DIR, '_input')
OUTPUT_FOLDER = os.path.join(BASE_DIR, '_output')
APP_FOLDER = os.path.join(BASE_DIR, 'scraper')

# Rotating proxies
PROXY_COUNT = 20
PROXY_COUNTRIES = ['DE', 'CH', 'FR']
ROTATING_PROXY_LIST_PATH = os.path.join(INPUT_FOLDER, 'proxylist.txt')
ROTATING_PROXY_BACKOFF_BASE = 30
ROTATING_PROXY_PAGE_RETRY_TIMES = 20
ROTATING_PROXY_BACKOFF_CAP = 7200

# Crawl responsibly by identifying yourself (and your website) on the user-agent
USER_AGENT = 'Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9) Gecko/20080705 Firefox/3.0 Kapiko/3.0'

# Obey robots.txt rules
ROBOTSTXT_OBEY = True

# Configure maximum concurrent requests performed by Scrapy (default: 16)
CONCURRENT_REQUESTS = 16
DOWNLOAD_DELAY = 1.5
# CONCURRENT_REQUESTS_PER_DOMAIN = 1

DOWNLOADER_MIDDLEWARES = {
    'scraper.middlewares.AutoRotatingProxyMiddleware': 610,
    'rotating_proxies.middlewares.BanDetectionMiddleware': 620,
}
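

# Per-spider override sketch, not part of the gist (standard Scrapy
# custom_settings mechanism; the spider name and values are assumptions).
# The middleware reads these keys through crawler.settings in from_crawler,
# so custom_settings lets a single spider retry more proxies per page or
# back off banned proxies differently from the project-wide defaults.
import scrapy


class PatientSpider(scrapy.Spider):
    name = 'patient'

    custom_settings = {
        # Try more proxies per page before giving up on a request.
        'ROTATING_PROXY_PAGE_RETRY_TIMES': 40,
        # Re-try banned proxies after a shorter base backoff.
        'ROTATING_PROXY_BACKOFF_BASE': 10,
    }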