-
-
Save kbarre123/8d4ad5fd9d41b9e84c69 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from random import choice

from scrapy import signals
from scrapy.exceptions import NotConfigured
class RotateUserAgentMiddleware(object):
    """Downloader middleware that sets a random User-Agent on each request.

    Candidate strings come from the ``USER_AGENT_CHOICES`` setting. Rotation
    is off by default; a spider opts in by defining
    ``rotate_user_agent = True``.
    """

    def __init__(self, user_agents):
        """Store the candidate pool; rotation stays off until a spider opts in.

        Args:
            user_agents: A single User-Agent string, an iterable of such
                strings, or ``None`` for an empty pool.
        """
        self.enabled = False
        # random.choice() on a bare string returns a single character and it
        # cannot index sets or generators, so normalise to a list up front.
        if user_agents is None:
            user_agents = []
        elif isinstance(user_agents, (str, bytes)):
            user_agents = [user_agents] if user_agents else []
        self.user_agents = list(user_agents)

    @classmethod
    def from_crawler(cls, crawler):
        """Build the middleware from crawler settings and hook spider_opened.

        Raises:
            NotConfigured: If ``USER_AGENT_CHOICES`` is missing or empty.
        """
        user_agents = crawler.settings.get('USER_AGENT_CHOICES', [])
        if not user_agents:
            raise NotConfigured("USER_AGENT_CHOICES not set or empty")
        middleware = cls(user_agents)
        crawler.signals.connect(
            middleware.spider_opened, signal=signals.spider_opened
        )
        return middleware

    def spider_opened(self, spider):
        """Take the on/off switch from the spider's ``rotate_user_agent``.

        A spider without that attribute leaves the current state unchanged.
        """
        self.enabled = getattr(spider, 'rotate_user_agent', self.enabled)

    def process_request(self, request, spider):
        """Overwrite the request's User-Agent header with a random candidate.

        Does nothing when rotation is disabled or the pool is empty.
        Always returns ``None``.
        """
        if not self.enabled or not self.user_agents:
            return
        request.headers['user-agent'] = choice(self.user_agents)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# The ``scrapy.spider`` module was deprecated in Scrapy 1.0; the top-level
# shortcut is the supported import path.
from scrapy import Spider


class ProjectSpider(Spider):
    """Example spider that opts in to per-request User-Agent rotation."""

    name = 'project-website.com'
    # Read via getattr() in RotateUserAgentMiddleware.spider_opened to
    # switch rotation on for this spider.
    rotate_user_agent = True
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Register the rotating User-Agent middleware with the downloader.
# NOTE(review): priority 110 is presumably meant to run ahead of Scrapy's
# built-in UserAgentMiddleware -- confirm against DOWNLOADER_MIDDLEWARES_BASE.
DOWNLOADER_MIDDLEWARES = {
    'project.middlewares.RotateUserAgentMiddleware': 110,
}
# Pool of User-Agent strings that RotateUserAgentMiddleware picks from.
USER_AGENT_CHOICES = [
    'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:23.0) Gecko/20100101 Firefox/23.0',
    'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.62 Safari/537.36',
    'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.2; WOW64; Trident/6.0)',
    'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/33.0.1750.146 Safari/537.36',
    'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/33.0.1750.146 Safari/537.36',
    'Mozilla/5.0 (X11; Linux x86_64; rv:24.0) Gecko/20140205 Firefox/24.0 Iceweasel/24.3.0',
    'Mozilla/5.0 (Windows NT 6.2; WOW64; rv:28.0) Gecko/20100101 Firefox/28.0',
    'Mozilla/5.0 (Windows NT 6.2; WOW64; rv:28.0) AppleWebKit/534.57.2 (KHTML, like Gecko) Version/5.1.7 Safari/534.57.2',
    # New user agents from http://whatsmyuseragent.com/commonuseragents
    # (parentheses restored; the copied strings had them replaced by ';').
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36',
    'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko',
    'Mozilla/5.0 (iPhone; CPU iPhone OS 8_1_2 like Mac OS X) AppleWebKit/600.1.4 (KHTML, like Gecko) Version/8.0 Mobile/12B440 Safari/600.1.4',
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.143 Safari/537.36',
    'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36',
    'Mozilla/5.0 (Windows NT 6.1; rv:34.0) Gecko/20100101 Firefox/34.0',
    'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:34.0) Gecko/20100101 Firefox/34.0',
    'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/7.0)',
    'Mozilla/5.0 (Windows NT 6.2; WOW64; rv:27.0) Gecko/20100101 Firefox/27.0',
    # NOTE(review): this is a search-engine crawler UA, not a browser;
    # consider whether impersonating it is appropriate for the target site.
    'Mozilla/5.0 (compatible; Baiduspider/2.0; +http://www.baidu.com/search/spider.html)',
    'Mozilla/5.0 (Windows NT 6.3; WOW64; Trident/7.0; rv:11.0) like Gecko',
    'Mozilla/5.0 (iPhone; CPU iPhone OS 7_1_2 like Mac OS X) AppleWebKit/537.51.2 (KHTML, like Gecko) Version/7.0 Mobile/11D257 Safari/9537.53',
    'Mozilla/5.0 (Windows NT 6.1; rv:35.0) Gecko/20100101 Firefox/35.0',
    'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:35.0) Gecko/20100101 Firefox/35.0',
    'Mozilla/5.0 (Windows NT 6.3; WOW64; rv:34.0) Gecko/20100101 Firefox/34.0',
    'Mozilla/5.0 (iPad; CPU OS 8_1_2 like Mac OS X) AppleWebKit/600.1.4 (KHTML, like Gecko) Version/8.0 Mobile/12B440 Safari/600.1.4',
    'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/600.2.5 (KHTML, like Gecko) Version/8.0.2 Safari/600.2.5',
    'Mozilla/5.0 (Windows NT 6.3; WOW64; rv:35.0) Gecko/20100101 Firefox/35.0',
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.99 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36',
    'Mozilla/5.0 (iPhone; CPU iPhone OS 8_0_2 like Mac OS X) AppleWebKit/600.1.4 (KHTML, like Gecko) Version/8.0 Mobile/12A405 Safari/600.1.4',
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/40.0.2214.93 Safari/537.36',
    'Mozilla/5.0 (Windows NT 5.1; rv:34.0) Gecko/20100101 Firefox/34.0',
    'Mozilla/5.0 (Windows NT 5.1; rv:35.0) Gecko/20100101 Firefox/35.0',
]
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment