Skip to content

Instantly share code, notes, and snippets.

@pradeepbn
Created July 25, 2016 09:31
Show Gist options
  • Save pradeepbn/98c73d99b2f0c0dab845567bba3c7b97 to your computer and use it in GitHub Desktop.
Save pradeepbn/98c73d99b2f0c0dab845567bba3c7b97 to your computer and use it in GitHub Desktop.
Scrapy settings
# Project identity: the bot name and the package that holds the spiders.
BOT_NAME = 'BusinessContacts'
NEWSPIDER_MODULE = 'BusinessContacts.spiders'
SPIDER_MODULES = [NEWSPIDER_MODULE]

# Crawl responsibly: identify the crawler (and your website) in the user-agent.
#USER_AGENT = 'BusinessContacts (+http://www.yourdomain.com)'

# Obey robots.txt rules.
ROBOTSTXT_OBEY = True

# Maximum concurrent requests performed by Scrapy (default: 16).
#CONCURRENT_REQUESTS = 1

# Delay for requests to the same website (default: 0).
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also the autothrottle settings and docs.
DOWNLOAD_DELAY = 4
## The download delay setting will honor only one of these two:
#CONCURRENT_REQUESTS_PER_DOMAIN = 64
#CONCURRENT_REQUESTS_PER_IP = 16

#CONCURRENT_ITEMS = 400

# Cookie handling and retries are both switched on explicitly.
COOKIES_ENABLED = True
RETRY_ENABLED = True
### Browser User-Agent strings kept as a project-level setting.
### A more comprehensive list can be found at
### http://techpatterns.com/forums/about304.html
# NOTE(review): nothing in this settings file reads USER_AGENT_LIST; presumably
# a project middleware picks from it -- confirm against the project code.
USER_AGENT_LIST = [
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.7 (KHTML, like Gecko) Chrome/16.0.912.36 Safari/535.7',
    'Mozilla/5.0 (Windows NT 6.2; Win64; x64; rv:16.0) Gecko/16.0 Firefox/16.0',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/534.55.3 (KHTML, like Gecko) Version/5.1.3 Safari/534.53.10',
]
# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers. Only the Crawlera retry header is
# active; the commented entries are optional extras left for reference.
# NOTE(review): 'Accept' and 'Accept-Language' are commented out, so this
# dict does not carry them -- confirm that dropping them is intended.
DEFAULT_REQUEST_HEADERS = {
    'X-Crawlera-Max-Retries': 2,
    #'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    #'Accept-Language': 'en',
    #'X-Crawlera-UA' : 'desktop',
    #'X-Crawlera-Debug': 'ua',
    #'X-Crawlera-JobId' : 999,
}
# Spider middlewares, keyed by import path with their order as the value.
# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
SPIDER_MIDDLEWARES = {
    'scrapy_splash.SplashDeduplicateArgsMiddleware': 100,
}
# Downloader middlewares, keyed by import path with their order as the value.
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
#HTTP_PROXY = 'http://127.0.0.1:8123'
#DOWNLOADER_MIDDLEWARES = {
# # Disable compression middleware, so the actual HTML pages are cached
#}
# NOTE(review): the scrapy-splash README appears to also re-register Scrapy's
# HttpCompressionMiddleware at order 810 alongside the two Splash entries --
# confirm whether it was left out here on purpose.
DOWNLOADER_MIDDLEWARES = {
    'scrapy_crawlera.CrawleraMiddleware': 300,
    'scrapy_splash.SplashCookiesMiddleware': 723,
    'scrapy_splash.SplashMiddleware': 725,
}
# Splash integration: address of the Splash HTTP endpoint.
SPLASH_URL = 'http://localhost:8050/'

# Splash-aware replacements for the duplicate filter and the HTTP cache
# storage backend.
# NOTE(review): HTTPCACHE_ENABLED is not set in this file, so the storage
# class presumably only matters once the HTTP cache is turned on -- confirm.
HTTPCACHE_STORAGE = 'scrapy_splash.SplashAwareFSCacheStorage'
DUPEFILTER_CLASS = 'scrapy_splash.SplashAwareDupeFilter'
import os

# Crawlera integration switch (scrapy_crawlera.CrawleraMiddleware is
# registered in DOWNLOADER_MIDDLEWARES).
CRAWLERA_ENABLED = True
# Take the key from the CRAWLERA_APIKEY environment variable so the secret
# does not have to live in this file; when the variable is unset the original
# '<API_KEY>' placeholder is kept, so existing setups behave as before.
CRAWLERA_APIKEY = os.environ.get('CRAWLERA_APIKEY', '<API_KEY>')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment