Scrapy middlewares for random agent list and proxy server usage.
import os
import random
from scrapy.conf import settings


class RandomUserAgentMiddleware(object):
    def process_request(self, request, spider):
        ua = random.choice(settings.get('USER_AGENT_LIST'))
        if ua:
            request.headers.setdefault('User-Agent', ua)


class ProxyMiddleware(object):
    def process_request(self, request, spider):
        request.meta['proxy'] = settings.get('HTTP_PROXY')
# More comprehensive list can be found at
# http://techpatterns.com/forums/about304.html
USER_AGENT_LIST = [
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.7 (KHTML, like Gecko) Chrome/16.0.912.36 Safari/535.7',
    'Mozilla/5.0 (Windows NT 6.2; Win64; x64; rv:16.0) Gecko/16.0 Firefox/16.0',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/534.55.3 (KHTML, like Gecko) Version/5.1.3 Safari/534.53.10',
]

DOWNLOADER_MIDDLEWARES = {
    'myproject.middlewares.RandomUserAgentMiddleware': 400,
    'myproject.middlewares.ProxyMiddleware': 410,
    'scrapy.contrib.downloadermiddleware.useragent.UserAgentMiddleware': None,
    # Disable compression middleware, so the actual HTML pages are cached
}
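
The ProxyMiddleware reads an HTTP_PROXY setting that the gist itself does not show. A minimal sketch of that setting, with a placeholder address (point it at whatever proxy you actually use):

# settings.py -- consumed by ProxyMiddleware; the address below is only a placeholder
HTTP_PROXY = 'http://127.0.0.1:8123'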
@BrongoObenge commented Dec 22, 2014

Hello,
Can I see what you have in `from scrapy.conf import settings`?
Thank you in advance.

Edit: Ignore this, my bad

@vijeth27 commented Nov 8, 2020

To anybody trying to implement this using Scrapy 1.0 or beyond: `settings.get` throws an AttributeError ([much the same error as this Stack Overflow question](https://stackoverflow.com/questions/32984597/scrapy-attributeerror-settings-object-has-no-attribute-update-settings)).

A workaround is to use the following code in your middlewares.py. (I will add this to a public repo soon; apologies for the poor code formatting.)
```python
import random


class RandomUserAgentMiddleware(object):
    def __init__(self, crawler):
        # Read settings through the crawler instead of the removed scrapy.conf
        self._user_agent_list = crawler.settings.get('USER_AGENT_LIST')
        self.crawler = crawler

    @classmethod
    def from_crawler(cls, crawler):
        return cls(crawler)

    def process_request(self, request, spider):
        ua = random.choice(self._user_agent_list)
        if ua:
            request.headers.setdefault('User-Agent', ua)


class ProxyMiddleware(object):
    def __init__(self, crawler):
        self._http_proxy = crawler.settings.get('HTTP_PROXY')
        self.crawler = crawler

    @classmethod
    def from_crawler(cls, crawler):
        return cls(crawler)

    def process_request(self, request, spider):
        request.meta['proxy'] = self._http_proxy
```
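
For completeness, the matching settings on Scrapy 1.0+ would look roughly like the sketch below (the project name `myproject` is assumed, as in the original gist; the built-in user-agent middleware moved out of `scrapy.contrib` in 1.0):

```python
# settings.py sketch for Scrapy 1.0+ -- 'myproject' is an assumed project name.
# USER_AGENT_LIST and HTTP_PROXY are defined exactly as in the original gist.
DOWNLOADER_MIDDLEWARES = {
    'myproject.middlewares.RandomUserAgentMiddleware': 400,
    'myproject.middlewares.ProxyMiddleware': 410,
    # The built-in user-agent middleware path changed in Scrapy 1.0:
    'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
}
```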