Skip to content

Instantly share code, notes, and snippets.

@kbarre123
Forked from seagatesoft/middlewares.py
Last active August 29, 2015 14:15
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save kbarre123/8d4ad5fd9d41b9e84c69 to your computer and use it in GitHub Desktop.
Save kbarre123/8d4ad5fd9d41b9e84c69 to your computer and use it in GitHub Desktop.
from random import choice
from scrapy import signals
from scrapy.exceptions import NotConfigured
class RotateUserAgentMiddleware(object):
    """Downloader middleware that sets a random User-Agent on each request.

    The pool of candidate strings is read from the ``USER_AGENT_CHOICES``
    setting.  Rotation is off by default and is switched on per spider by a
    truthy ``rotate_user_agent`` attribute on the spider.
    """

    def __init__(self, user_agents):
        """Store the pool of user-agent strings.

        Args:
            user_agents: An iterable of user-agent strings.  A single bare
                string is accepted and treated as a one-element pool;
                ``None`` or an empty iterable yields an empty pool.
        """
        # Rotation stays off until spider_opened() sees the spider opt in.
        self.enabled = False
        # A bare string would otherwise be indexed character by character by
        # random.choice(), sending one-letter User-Agent headers.
        if isinstance(user_agents, str):
            user_agents = [user_agents]
        # random.choice() needs an indexable sequence, so normalize to a list.
        self.user_agents = list(user_agents or ())

    @classmethod
    def from_crawler(cls, crawler):
        """Build the middleware from crawler settings.

        Raises:
            NotConfigured: If ``USER_AGENT_CHOICES`` is missing or empty.
        """
        user_agents = crawler.settings.get('USER_AGENT_CHOICES', [])
        if not user_agents:
            raise NotConfigured("USER_AGENT_CHOICES not set or empty")
        middleware = cls(user_agents)
        # The enabled flag is decided per spider, so wait for spider_opened.
        crawler.signals.connect(
            middleware.spider_opened, signal=signals.spider_opened
        )
        return middleware

    def spider_opened(self, spider):
        """Enable or disable rotation from ``spider.rotate_user_agent``.

        If the spider does not define the attribute, the current value of
        ``self.enabled`` is kept.
        """
        self.enabled = getattr(spider, 'rotate_user_agent', self.enabled)

    def process_request(self, request, spider):
        """Overwrite the request's ``user-agent`` header with a random choice.

        Does nothing when rotation is disabled or the pool is empty.  Always
        returns ``None``.
        """
        if not self.enabled or not self.user_agents:
            return
        request.headers['user-agent'] = choice(self.user_agents)
# ``scrapy.spider`` is a deprecated module path that newer Scrapy releases no
# longer provide; the top-level ``scrapy.Spider`` export works on both.
from scrapy import Spider


class ProjectSpider(Spider):
    """Example spider that opts in to user-agent rotation."""

    name = 'project-website.com'
    # Read by RotateUserAgentMiddleware.spider_opened via getattr(); a truthy
    # value turns on per-request user-agent rotation for this spider.
    rotate_user_agent = True
# Registers the user-agent rotation middleware with Scrapy's downloader.
# The key is the dotted import path of the middleware class; the integer is
# its ordering value within the downloader middleware chain.
DOWNLOADER_MIDDLEWARES = {
    'project.middlewares.RotateUserAgentMiddleware': 110,
}
# Pool of User-Agent header values that RotateUserAgentMiddleware picks from.
# Each entry follows the usual "Product/version (comment) ..." layout; the
# parenthesised comment groups must stay intact, since a string without them
# is not a well-formed browser User-Agent.
USER_AGENT_CHOICES = [
    'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:23.0) Gecko/20100101 Firefox/23.0',
    'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.62 Safari/537.36',
    'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.2; WOW64; Trident/6.0)',
    'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/33.0.1750.146 Safari/537.36',
    'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/33.0.1750.146 Safari/537.36',
    'Mozilla/5.0 (X11; Linux x86_64; rv:24.0) Gecko/20140205 Firefox/24.0 Iceweasel/24.3.0',
    'Mozilla/5.0 (Windows NT 6.2; WOW64; rv:28.0) Gecko/20100101 Firefox/28.0',
    'Mozilla/5.0 (Windows NT 6.2; WOW64; rv:28.0) AppleWebKit/534.57.2 (KHTML, like Gecko) Version/5.1.7 Safari/534.57.2',
    # New user agents from http://whatsmyuseragent.com/commonuseragents
    # (parentheses restored; they had been replaced by semicolons).
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36',
    'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko',
    'Mozilla/5.0 (iPhone; CPU iPhone OS 8_1_2 like Mac OS X) AppleWebKit/600.1.4 (KHTML, like Gecko) Version/8.0 Mobile/12B440 Safari/600.1.4',
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.143 Safari/537.36',
    'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36',
    'Mozilla/5.0 (Windows NT 6.1; rv:34.0) Gecko/20100101 Firefox/34.0',
    'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:34.0) Gecko/20100101 Firefox/34.0',
    'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/7.0)',
    'Mozilla/5.0 (Windows NT 6.2; WOW64; rv:27.0) Gecko/20100101 Firefox/27.0',
    # NOTE(review): this is a search-engine crawler UA, not a browser; confirm
    # it is intentionally part of the rotation.
    'Mozilla/5.0 (compatible; Baiduspider/2.0; +http://www.baidu.com/search/spider.html)',
    'Mozilla/5.0 (Windows NT 6.3; WOW64; Trident/7.0; rv:11.0) like Gecko',
    'Mozilla/5.0 (iPhone; CPU iPhone OS 7_1_2 like Mac OS X) AppleWebKit/537.51.2 (KHTML, like Gecko) Version/7.0 Mobile/11D257 Safari/9537.53',
    'Mozilla/5.0 (Windows NT 6.1; rv:35.0) Gecko/20100101 Firefox/35.0',
    'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:35.0) Gecko/20100101 Firefox/35.0',
    'Mozilla/5.0 (Windows NT 6.3; WOW64; rv:34.0) Gecko/20100101 Firefox/34.0',
    'Mozilla/5.0 (iPad; CPU OS 8_1_2 like Mac OS X) AppleWebKit/600.1.4 (KHTML, like Gecko) Version/8.0 Mobile/12B440 Safari/600.1.4',
    'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/600.2.5 (KHTML, like Gecko) Version/8.0.2 Safari/600.2.5',
    'Mozilla/5.0 (Windows NT 6.3; WOW64; rv:35.0) Gecko/20100101 Firefox/35.0',
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.99 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36',
    'Mozilla/5.0 (iPhone; CPU iPhone OS 8_0_2 like Mac OS X) AppleWebKit/600.1.4 (KHTML, like Gecko) Version/8.0 Mobile/12A405 Safari/600.1.4',
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/40.0.2214.93 Safari/537.36',
    'Mozilla/5.0 (Windows NT 5.1; rv:34.0) Gecko/20100101 Firefox/34.0',
    'Mozilla/5.0 (Windows NT 5.1; rv:35.0) Gecko/20100101 Firefox/35.0',
]
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment