@ResolveWang
Created January 20, 2018 13:16
scrapy download speed checker — a Scrapy spider that reads the per-slot download delay and concurrency actually in effect from the engine's downloader at parse time.
from scrapy import Request
from scrapy_redis.spiders import RedisSpider

# SpiderMixin and SeedURLItem are project-local (not defined in this gist);
# they must be importable from the surrounding project.


class GoogleNewsSpider(RedisSpider, SpiderMixin):
    """Spider for Google News that also reports its effective download speed."""
    name = 'google_news'
    redis_key = 'search_engine:google_news'
    req_num = 10
    proxy_mode = 2
    custom_settings = {
        "DOWNLOAD_DELAY": 20,
        "CONCURRENT_REQUESTS_PER_DOMAIN": 1
    }

    def parse(self, response):
        """Parse result URLs from a Google News search page."""
        # Inspect the downloader's per-domain slots to see the delay and
        # concurrency actually in effect (AutoThrottle may adjust them).
        print(self.crawler.engine.downloader.slots)
        slot = self.crawler.engine.downloader.slots["google.com"]
        self.logger.info('delay is {}, concurrency is {}'.format(slot.delay, slot.concurrency))

        url_format, search_word, count, mode, site = self.get_args_from_meta(response)
        # Parser errors are not handled here.
        urls = response.css('.g .r a::attr(href)').extract()
        sites = response.css('.g ._PHs').xpath('./text()').extract()
        site_urls = list(zip(urls, sites))
        for url, site in site_urls:
            yield SeedURLItem(url=url, site=site, keyword=search_word)

        # Follow the next results page only while the current page yielded results.
        if site_urls:
            url = self.construct_url(url_format, search_word, count)
            yield Request(url, callback=self.parse,
                          meta={
                              'url_format': url_format,
                              'search_word': search_word,
                              'count': count + 1,
                              'mode': mode,
                              'site': site
                          })
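To watch these values continuously rather than only inside parse, the same slot attributes can be read from a small Scrapy extension. The following is a minimal sketch, not part of the original gist: the class name SlotStatsLogger is illustrative, and it assumes the standard scrapy.signals.response_received signal and the same downloader.slots mapping used above.

from scrapy import signals


class SlotStatsLogger:
    """Log every downloader slot's delay and concurrency after each response."""

    def __init__(self, crawler):
        self.crawler = crawler

    @classmethod
    def from_crawler(cls, crawler):
        ext = cls(crawler)
        # Fire the logger each time a response comes back from the downloader.
        crawler.signals.connect(ext.response_received,
                                signal=signals.response_received)
        return ext

    def response_received(self, response, request, spider):
        for key, slot in self.crawler.engine.downloader.slots.items():
            spider.logger.info('slot=%s delay=%.2f concurrency=%d',
                               key, slot.delay, slot.concurrency)

Enabled via the EXTENSIONS setting (the dotted path here is hypothetical, e.g. EXTENSIONS = {'myproject.extensions.SlotStatsLogger': 500}), each 'google.com' line should report delay 20 and concurrency 1 under the custom_settings above, unless AutoThrottle or per-slot settings override them.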