Skip to content

Instantly share code, notes, and snippets.

@tevino
Last active August 27, 2019 02:34
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save tevino/608655e3cda56af099b54dd58b508647 to your computer and use it in GitHub Desktop.
Save tevino/608655e3cda56af099b54dd58b508647 to your computer and use it in GitHub Desktop.
A Scrapy spider that collects CASIO (including G-SHOCK) watch specs to help you choose a watch.
import re
import scrapy
def parse_date(s):
    """Convert a release-date label into an integer of the form YYYYMM.

    The label is expected to look like '2019年8月', optionally combined with
    a marker such as 'NEW'. Every non-digit character is discarded, so extra
    whitespace or marker text around the date does not break parsing.

    Args:
        s: The label text, or None when the page has no label.

    Returns:
        The date as an int (e.g. 201908). A falsy input (None or '') is
        returned unchanged. None is returned when the label contains no
        digits at all, e.g. a bare 'NEW' or 'COMMING SOON'.
    """
    if not s:
        return s
    digits = ''.join(re.findall(r'[0-9]+', s))
    if not digits:
        # Marker-only labels carry no date; int('') would raise ValueError.
        return None
    if len(digits) == 5:
        # Four-digit year plus a one-digit month: add a leading zero to the
        # month so every result has the same YYYYMM width.
        digits = digits[:4] + '0' + digits[4]
    return int(digits)
def parse_price(s):
    """Extract the integer price from a price label such as '价格:1,290元'.

    The first run of digits (thousands separators allowed) is taken as the
    price, so the surrounding text does not have to match one exact
    prefix/suffix spelling. Any decimal part after that run is ignored.

    Args:
        s: The price text, or None when the page has no price element.

    Returns:
        The price as an int. A falsy input (None or '') is returned
        unchanged. None is returned when the text contains no digits.
    """
    if not s:
        return s
    match = re.search(r'[0-9][0-9,]*', s)
    if match is None:
        # A non-numeric price text must not abort parsing of the whole page.
        return None
    return int(match.group().replace(',', ''))
def parse_is_new(s):
    """Tell whether a label marks the watch as new or upcoming.

    Args:
        s: The label text, or None when the page has no label.

    Returns:
        True when the text contains 'NEW' or 'COMMING SOON', otherwise
        False (including for None and the empty string).
    """
    if not s:
        return False
    # NOTE(review): 'COMMING SOON' presumably mirrors the site's own
    # spelling; it is matched exactly as written here.
    return any(marker in s for marker in ('NEW', 'COMMING SOON'))
def parse_is_square(watch):
    """Guess whether a scraped watch item is a "square" model.

    An item qualifies when its detail page URL contains 'shock' and its
    name contains a four-digit group starting with 5 (e.g. 'GW-B5600').

    Args:
        watch: A mapping with 'detail_page' and 'name' entries. Either
            value may be None or absent, which is treated as empty text.

    Returns:
        True when both conditions hold, otherwise False.
    """
    # The name can be None when the selector found nothing on the page;
    # feeding None to the regex would raise TypeError.
    name = watch.get('name') or ''
    detail_page = watch.get('detail_page') or ''
    # search() finds the pattern anywhere, including after a newline,
    # which the former '.*'-prefixed match() could not cross.
    return 'shock' in detail_page and bool(re.search(r'5\d\d\d', name))
class WatchSpider(scrapy.Spider):
    """Spider that walks the CASIO watch search listing page by page.

    Every entry found under '.model-list .column' is yielded as a plain
    dict. After the entries, the first pagination link whose markup
    contains '下一页' is followed with this same callback.
    """

    name = 'watch_spider'
    start_urls = ['https://www.casio.com.cn/wat/search.html']

    def parse(self, response):
        """Yield one dict per listed watch, then a request for the next page.

        Args:
            response: The downloaded listing page.

        Yields:
            Watch dicts with the keys 'name', 'image_url', 'price',
            'release_date', 'is_new', 'features', 'detail_page' and
            'is_square', followed by at most one follow-up request.
        """
        for column in response.css('.model-list .column'):
            # One label feeds both the release date and the "new" flag.
            label = column.css('.label-alert::text').get()
            image_src = column.css('.figure img::attr(src)').get()
            detail_href = column.css('.info a::attr(href)').get()
            price_text = column.css('.info a p::text').get()

            item = {
                'name': column.css('h5.t-size-x-small::text').get(),
                'image_url': response.urljoin(image_src),
                'price': parse_price(price_text),
                'release_date': parse_date(label),
                'is_new': parse_is_new(label),
                'features': column.css('.feature span::text').getall(),
                'detail_page': response.urljoin(detail_href),
            }
            # Computed last because it reads fields of the finished item.
            item['is_square'] = parse_is_square(item)
            yield item

        # Only the first link mentioning "next page" is followed.
        next_link = next(
            (
                anchor
                for anchor in response.css('.pagination a')
                if '下一页' in anchor.get()
            ),
            None,
        )
        if next_link is not None:
            yield response.follow(next_link, self.parse)
@tevino
Copy link
Author

tevino commented Aug 26, 2019

> pip install scrapy

> scrapy runspider casio-watch-spider.py -o watches.json

> python

>>> import json
>>> ws = json.load(open('watches.json'))
>>> for w in ws:
...     if w['is_square'] and set(('太阳能动力', '六局电波', '蓝牙')).issubset(w['features']):
...         print(w)
... 

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment