#
# middleware.py (forked from vertigg/middleware.py)
# Example of an automatic rotating-proxy middleware for scrapy-rotating-proxies,
# using proxybroker to refresh the proxy list whenever it runs out.
#
import codecs
import logging
from subprocess import call

from rotating_proxies.expire import Proxies
from rotating_proxies.middlewares import RotatingProxyMiddleware
from scrapy import signals
from scrapy.exceptions import CloseSpider, NotConfigured

from scraper.settings import ROTATING_PROXY_LIST_PATH

logger = logging.getLogger(__name__)


class AutoRotatingProxyMiddleware(RotatingProxyMiddleware):

    def reanimate_proxies(self):
        """Overridden as a no-op so that dead proxies are never reanimated."""

    @staticmethod
    def _refresh_proxy_list():
        """Call the external script that obtains a new list of proxies."""
        call(['python', 'scraper/proxy.py'])

    @classmethod
    def _read_proxy_path_file(cls, proxy_path, reset_proxies=False):
        """
        Read the proxy list from a text file, re-obtaining the list if the
        file is empty. A refresh can be forced by passing reset_proxies=True.
        """
        if reset_proxies:
            cls._refresh_proxy_list()
            return cls._read_proxy_path_file(proxy_path)
        with codecs.open(proxy_path, 'r', encoding='utf8') as f:
            proxy_list = [line.strip() for line in f if line.strip()]
        if not proxy_list:
            logger.error('Proxy list is empty! Refreshing list')
            cls._refresh_proxy_list()
            return cls._read_proxy_path_file(proxy_path)
        return proxy_list

    @classmethod
    def from_crawler(cls, crawler):
        s = crawler.settings
        proxy_path = s.get('ROTATING_PROXY_LIST_PATH', None)
        if proxy_path is not None:
            proxy_list = cls._read_proxy_path_file(proxy_path)
        else:
            proxy_list = s.getlist('ROTATING_PROXY_LIST')
        if not proxy_list:
            raise NotConfigured()
        mw = cls(
            proxy_list=proxy_list,
            logstats_interval=s.getfloat('ROTATING_PROXY_LOGSTATS_INTERVAL', 30),
            stop_if_no_proxies=s.getbool('ROTATING_PROXY_CLOSE_SPIDER', False),
            max_proxies_to_try=s.getint('ROTATING_PROXY_PAGE_RETRY_TIMES', 5),
            backoff_base=s.getfloat('ROTATING_PROXY_BACKOFF_BASE', 300),
            backoff_cap=s.getfloat('ROTATING_PROXY_BACKOFF_CAP', 3600),
            crawler=crawler,
        )
        crawler.signals.connect(mw.engine_started,
                                signal=signals.engine_started)
        crawler.signals.connect(mw.engine_stopped,
                                signal=signals.engine_stopped)
        return mw

    def process_request(self, request, spider):
        # Leave requests alone if they already carry a manually assigned proxy.
        if 'proxy' in request.meta and not request.meta.get('_rotating_proxy'):
            return
        proxy = self.proxies.get_random()
        if not proxy:
            if self.stop_if_no_proxies:
                raise CloseSpider("no_proxies")
            logger.warning("No proxies available; resetting proxy list")
            new_list = self._read_proxy_path_file(
                ROTATING_PROXY_LIST_PATH, reset_proxies=True)
            backoff = self.proxies.backoff
            self.proxies = Proxies(
                self.cleanup_proxy_list(new_list), backoff=backoff)
            proxy = self.proxies.get_random()
            if proxy is None:
                logger.error("No proxies available even after a reset.")
                raise CloseSpider("no_proxies_after_reset")
        request.meta['proxy'] = proxy
        request.meta['download_slot'] = self.get_proxy_slot(proxy)
        request.meta['_rotating_proxy'] = True
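

# Usage sketch, not part of the gist: a minimal spider showing how requests
# interact with the middleware above. The spider name, URL, and proxy address
# are assumptions. Requests without an explicit proxy get one assigned in
# process_request; a request that already sets meta['proxy'] (and carries no
# '_rotating_proxy' flag) is left untouched, as the first check in
# process_request shows.
import scrapy


class ExampleSpider(scrapy.Spider):
    name = 'example'

    def start_requests(self):
        # Rotated request: the middleware picks a random proxy and also sets
        # meta['download_slot'] and meta['_rotating_proxy'].
        yield scrapy.Request('https://httpbin.org/ip', callback=self.parse)
        # Pinned request: an explicit proxy bypasses rotation entirely
        # (hypothetical address, shown only to illustrate the bypass).
        yield scrapy.Request('https://httpbin.org/ip', callback=self.parse,
                             dont_filter=True,
                             meta={'proxy': 'http://127.0.0.1:8080'})

    def parse(self, response):
        yield {'used_proxy': response.request.meta.get('proxy')}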
#
# proxy.py
# Obtains a fresh list of proxies via proxybroker and writes it to
# _input/proxylist.txt, the file read by the middleware above.
#
import asyncio
import logging
import sys

from proxybroker import Broker

try:
    from scraper.settings import PROXY_COUNT, PROXY_COUNTRIES
except ModuleNotFoundError:
    # Allow running the script directly from the project root.
    import os
    sys.path.append(os.path.abspath('.'))
    from scraper.settings import PROXY_COUNT, PROXY_COUNTRIES

logging.basicConfig(stream=sys.stdout, level=logging.INFO,
                    format='[%(levelname)s]: %(message)s')


async def save(proxies, filename):
    """Save proxies to a file, one host:port entry per line."""
    with open(filename, 'w') as f:
        while True:
            proxy = await proxies.get()
            if proxy is None:
                break
            f.write(f'{proxy.host}:{proxy.port}\n')


def main():
    proxies = asyncio.Queue()
    broker = Broker(proxies)
    tasks = asyncio.gather(
        broker.find(types=['HTTP', 'HTTPS'], limit=PROXY_COUNT,
                    countries=PROXY_COUNTRIES),
        save(proxies, filename='_input/proxylist.txt'))
    loop = asyncio.get_event_loop()
    loop.run_until_complete(tasks)
    logging.info('Proxies obtained')


if __name__ == '__main__':
    logging.info(f'Obtaining {PROXY_COUNT} proxies via proxybroker')
    logging.info('Script settings can be edited in scraper/settings.py')
    main()
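

# Quick-check sketch, not part of the gist (assumption: run from the project
# root after `python scraper/proxy.py` has finished, so _input/proxylist.txt
# exists). It only confirms that the file was populated with one host:port
# entry per line, which is the format the middleware expects.
from pathlib import Path

entries = [line.strip()
           for line in Path('_input/proxylist.txt').read_text(encoding='utf8').splitlines()
           if line.strip()]
print(f'{len(entries)} proxies written, sample: {entries[:3]}')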
#
# settings.py
# Scrapy project settings for the scraper project.
#
import os

BOT_NAME = 'scraper'

SPIDER_MODULES = ['scraper.spiders']
NEWSPIDER_MODULE = 'scraper.spiders'

# Files and directories
BASE_DIR = os.getcwd()
INPUT_FOLDER = os.path.join(BASE_DIR, '_input')
OUTPUT_FOLDER = os.path.join(BASE_DIR, '_output')
APP_FOLDER = os.path.join(BASE_DIR, 'scraper')

# Rotating proxies
PROXY_COUNT = 20
PROXY_COUNTRIES = ['DE', 'CH', 'FR']
ROTATING_PROXY_LIST_PATH = os.path.join(INPUT_FOLDER, 'proxylist.txt')
ROTATING_PROXY_BACKOFF_BASE = 30
ROTATING_PROXY_PAGE_RETRY_TIMES = 20
ROTATING_PROXY_BACKOFF_CAP = 7200

# Crawl responsibly by identifying yourself (and your website) on the user-agent
USER_AGENT = 'Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9) Gecko/20080705 Firefox/3.0 Kapiko/3.0'

# Obey robots.txt rules
ROBOTSTXT_OBEY = True

# Configure maximum concurrent requests performed by Scrapy (default: 16)
CONCURRENT_REQUESTS = 16
DOWNLOAD_DELAY = 1.5
# CONCURRENT_REQUESTS_PER_DOMAIN = 1

DOWNLOADER_MIDDLEWARES = {
    'scraper.middlewares.AutoRotatingProxyMiddleware': 610,
    'rotating_proxies.middlewares.BanDetectionMiddleware': 620,
}
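

# Per-spider override sketch, not part of the gist (standard Scrapy
# custom_settings mechanism; the spider name and values are assumptions).
# The middleware reads these keys through crawler.settings in from_crawler,
# so custom_settings lets a single spider retry more proxies per page or
# back off banned proxies differently from the project-wide defaults.
import scrapy


class PatientSpider(scrapy.Spider):
    name = 'patient'

    custom_settings = {
        # Try more proxies per page before giving up on a request.
        'ROTATING_PROXY_PAGE_RETRY_TIMES': 40,
        # Re-try banned proxies after a shorter base backoff.
        'ROTATING_PROXY_BACKOFF_BASE': 10,
    }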