Created
January 20, 2018 13:16
-
-
Save ResolveWang/37f7d31d569a53f0b7dbf23c444eecbf to your computer and use it in GitHub Desktop.
scrapy download speed checker
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
class GoogleNewsSpider(RedisSpider, SpiderMixin):
    """Spider that scrapes Google News search result pages for seed URLs.

    Reads search tasks from the Redis queue at ``redis_key`` and, for each
    results page, yields one ``SeedURLItem`` per result plus a ``Request``
    for the next page while results keep coming back.
    """
    name = 'google_news'
    redis_key = 'search_engine:google_news'
    req_num = 10
    proxy_mode = 2
    # Deliberately throttled: Google rate-limits aggressively.
    custom_settings = {
        "DOWNLOAD_DELAY": 20,
        "CONCURRENT_REQUESTS_PER_DOMAIN": 1
    }

    def parse(self, response):
        """Parse result URLs from a Google News search page.

        Yields:
            SeedURLItem: one per (url, site) pair extracted from the page.
            Request: the next results page, only when this page had results.
        """
        # Log the live downloader-slot state (instead of print()) so the
        # effective download speed/throttle can be checked from the logs.
        self.logger.debug('downloader slots: %s',
                          self.crawler.engine.downloader.slots)
        slot = self.crawler.engine.downloader.slots["google.com"]
        self.logger.info('delay is {}, concurrency is {}'.format(
            slot.delay, slot.concurrency))

        url_format, search_word, count, mode, site = \
            self.get_args_from_meta(response)
        # Parser errors are deliberately not handled here (best effort).
        urls = response.css('.g .r a::attr(href)').extract()
        sites = response.css('.g ._PHs').xpath('./text()').extract()
        site_urls = list(zip(urls, sites))

        # BUG FIX: the loop variables must not be named `site` — the original
        # code clobbered the `site` unpacked from the request meta above, so
        # the follow-up Request carried the last result's site instead of the
        # original meta value.
        for result_url, result_site in site_urls:
            yield SeedURLItem(url=result_url, site=result_site,
                              keyword=search_word)

        # Keep paginating only while the current page returned results.
        if site_urls:
            next_url = self.construct_url(url_format, search_word, count)
            yield Request(next_url, callback=self.parse,
                          meta={
                              'url_format': url_format,
                              'search_word': search_word,
                              'count': count + 1,
                              'mode': mode,
                              'site': site
                          })
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment.