Last active
August 27, 2019 02:34
-
-
Save tevino/608655e3cda56af099b54dd58b508647 to your computer and use it in GitHub Desktop.
A spider that collects CASIO (including G-SHOCK) watch specs to help you choose a watch.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import re | |
import scrapy | |
def parse_date(s):
    """Convert a release-date label such as '2019年8月' into a YYYYMM integer.

    The '年', '月' and 'NEW' markers and all whitespace are removed; a
    single-digit month is zero-padded so that '2019年8月' becomes 201908.

    Args:
        s: Label text, or a falsy value (None / '') when the label is absent.

    Returns:
        The parsed integer; the input unchanged when it is falsy; or None
        when the remaining text is not numeric (e.g. 'COMMING SOON').
    """
    if not s:
        return s
    stripped = s.replace('年', '').replace('月', '').replace('NEW', '')
    # Drop every whitespace character: leftover spaces (e.g. from
    # '2019年8月 NEW') would otherwise defeat the length check below and
    # yield 20198 instead of 201908.
    digits = ''.join(stripped.split())
    if len(digits) == 5:
        # Four-digit year plus one-digit month: add a leading zero to month.
        digits = digits[:4] + "0" + digits[-1]
    try:
        return int(digits)
    except ValueError:
        # The label carries no usable date; report "unknown" instead of
        # aborting the whole crawl.
        return None
def parse_price(s):
    """Turn a price label such as '价格:1,290元' into an integer.

    Args:
        s: Price text, or a falsy value (None / '') when no price was found.

    Returns:
        The integer price, or the input unchanged when it is falsy.

    Raises:
        ValueError: If non-numeric text remains after removing the label,
            the currency unit and the thousands separators.
    """
    if not s:
        return s
    amount = s
    # Remove the label prefix, the currency unit and thousands separators.
    for noise in ('价格:', '元', ','):
        amount = amount.replace(noise, '')
    return int(amount)
def parse_is_new(s):
    """Report whether a label marks the watch as new or upcoming.

    Args:
        s: Label text, or a falsy value when the label is absent.

    Returns:
        True when the text contains 'NEW' or 'COMMING SOON', else False.
    """
    if not s:
        return False
    return any(marker in s for marker in ('NEW', 'COMMING SOON'))
def parse_is_square(watch):
    """Heuristically flag a watch item as a "square" model.

    An item qualifies when its detail page URL contains 'shock' and the
    first line of its name contains a '5' followed by three digits
    (e.g. 'GW-5000-1').

    Args:
        watch: Item mapping with 'detail_page' (str) and 'name'
            (str or None) entries.

    Returns:
        True when both conditions hold, otherwise False.
    """
    if 'shock' not in watch['detail_page']:
        return False
    # The name selector may have matched nothing; treat a missing name as
    # "no match" rather than letting re.match raise TypeError on None.
    name = watch['name'] or ''
    return bool(re.match(r'.*5\d\d\d.*', name))
class WatchSpider(scrapy.Spider):
    """Spider that walks the CASIO watch search listing page by page.

    Each listing entry is yielded as a plain dict of parsed fields.
    """

    name = 'watch_spider'
    start_urls = ['https://www.casio.com.cn/wat/search.html']

    def parse(self, response):
        """Yield one item per listed watch, then follow the next-page link.

        Args:
            response: The listing page response.

        Yields:
            A dict per watch, followed by at most one request for the
            next listing page, handled by this same callback.
        """
        for column in response.css('.model-list .column'):
            # The same label feeds both the release date and the "new" flag.
            label = column.css('.label-alert::text').get()
            image_src = column.css('.figure img::attr(src)').get()
            detail_href = column.css('.info a::attr(href)').get()
            price_text = column.css('.info a p::text').get()

            item = {
                'name': column.css('h5.t-size-x-small::text').get(),
                'image_url': response.urljoin(image_src),
                'price': parse_price(price_text),
                'release_date': parse_date(label),
                'is_new': parse_is_new(label),
                'features': column.css('.feature span::text').getall(),
                'detail_page': response.urljoin(detail_href),
            }
            # Derived from the name and detail page gathered above.
            item['is_square'] = parse_is_square(item)
            yield item

        # Only the first pagination anchor containing the "next page"
        # text is followed.
        next_link = next(
            (
                anchor
                for anchor in response.css('.pagination a')
                if '下一页' in anchor.get()
            ),
            None,
        )
        if next_link is not None:
            yield response.follow(next_link, self.parse)
Author
tevino
commented
Aug 26, 2019
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment