Skip to content

Instantly share code, notes, and snippets.

@ahlusar1989
Forked from seagatesoft/middlewares.py
Created October 11, 2017 19:47
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save ahlusar1989/1b896af7ffb59c13c415d981c3731bb9 to your computer and use it in GitHub Desktop.
Save ahlusar1989/1b896af7ffb59c13c415d981c3731bb9 to your computer and use it in GitHub Desktop.
An example of RotateUserAgentMiddleware
from random import choice
from scrapy import signals
from scrapy.exceptions import NotConfigured
class RotateUserAgentMiddleware(object):
    """Downloader middleware that picks a random User-Agent per request.

    The pool of candidate strings is read from the ``USER_AGENT_CHOICES``
    setting. Rotation starts disabled and is switched on per spider: when a
    spider opens, its ``rotate_user_agent`` attribute (if present) decides
    whether requests get their ``user-agent`` header overwritten.
    """

    def __init__(self, user_agents):
        """Store the pool of user-agent strings; rotation starts disabled.

        Args:
            user_agents: an iterable of user-agent strings, or a single
                user-agent string (treated as a one-element pool).
        """
        self.enabled = False
        # A bare string would otherwise make ``choice`` return a single
        # character, so wrap it into a one-element pool.
        if isinstance(user_agents, str):
            user_agents = [user_agents]
        # Copy into a list so ``choice`` always gets an indexable sequence;
        # a falsy value (None, empty) becomes an empty pool.
        self.user_agents = list(user_agents or ())

    @classmethod
    def from_crawler(cls, crawler):
        """Build the middleware from crawler settings.

        Reads ``USER_AGENT_CHOICES`` from ``crawler.settings`` and connects
        ``spider_opened`` to the ``spider_opened`` signal.

        Raises:
            NotConfigured: if ``USER_AGENT_CHOICES`` is missing or empty.
        """
        user_agents = crawler.settings.get('USER_AGENT_CHOICES', [])
        if not user_agents:
            raise NotConfigured("USER_AGENT_CHOICES not set or empty")
        o = cls(user_agents)
        crawler.signals.connect(o.spider_opened, signal=signals.spider_opened)
        return o

    def spider_opened(self, spider):
        """Enable or disable rotation from the spider's ``rotate_user_agent``.

        If the spider does not define the attribute, the current value of
        ``self.enabled`` is kept.
        """
        self.enabled = getattr(spider, 'rotate_user_agent', self.enabled)

    def process_request(self, request, spider):
        """Set a randomly chosen ``user-agent`` header on ``request``.

        Does nothing when rotation is disabled or the pool is empty.
        Always returns ``None``.
        """
        if not self.enabled or not self.user_agents:
            return
        request.headers['user-agent'] = choice(self.user_agents)
# ``scrapy.spider`` is the old, deprecated module path; the spider base
# class is importable from ``scrapy.spiders``.
from scrapy.spiders import Spider


class ProjectSpider(Spider):
    """Example spider that opts in to user-agent rotation."""

    name = 'project-website.com'
    # Read by RotateUserAgentMiddleware.spider_opened to enable rotation.
    rotate_user_agent = True
# Project settings: register the rotating user-agent middleware.
# 110 is the middleware's order value; see Scrapy's DOWNLOADER_MIDDLEWARES
# documentation for how it is positioned relative to the built-in ones.
DOWNLOADER_MIDDLEWARES = {'project.middlewares.RotateUserAgentMiddleware': 110}
# Pool of user-agent strings that RotateUserAgentMiddleware picks from at
# random for each request.
USER_AGENT_CHOICES = [
    # Firefox on Ubuntu
    'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:23.0) Gecko/20100101 Firefox/23.0',
    # Chrome on Linux
    'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.62 Safari/537.36',
    # Internet Explorer 10 on Windows
    'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.2; WOW64; Trident/6.0)',
    # Chrome 33 on Linux and Windows
    'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/33.0.1750.146 Safari/537.36',
    'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/33.0.1750.146 Safari/537.36',
    # Iceweasel / Firefox
    'Mozilla/5.0 (X11; Linux x86_64; rv:24.0) Gecko/20140205 Firefox/24.0 Iceweasel/24.3.0',
    'Mozilla/5.0 (Windows NT 6.2; WOW64; rv:28.0) Gecko/20100101 Firefox/28.0',
    # Safari 5.1.7 on Windows
    'Mozilla/5.0 (Windows NT 6.2; WOW64; rv:28.0) AppleWebKit/534.57.2 (KHTML, like Gecko) Version/5.1.7 Safari/534.57.2',
]
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment