Skip to content

Instantly share code, notes, and snippets.

@zhengxiaowai
Created March 20, 2018 09:19
Show Gist options
  • Save zhengxiaowai/ce411872b6db1c155a650702d1118dbf to your computer and use it in GitHub Desktop.
Save zhengxiaowai/ce411872b6db1c155a650702d1118dbf to your computer and use it in GitHub Desktop.
# coding: u8
import json
from requests_html import HTMLSession
# One shared HTMLSession, reused for the listing-page and detail-page requests below.
session = HTMLSession()
def get_name(html):
    """Return the book title found on a book detail page.

    The title heading is located with a fixed CSS selector, surrounding
    whitespace is stripped, and a trailing ``[预售]`` (pre-sale) marker is
    removed when present.

    Args:
        html: Parsed page object exposing ``find(selector, first=True)``.

    Returns:
        The cleaned title, or ``''`` when the heading cannot be found.
    """
    presale_marker = '[预售]'
    heading = html.find('body > div.container > div > div:nth-child(1) > div > div > div.col-md-8 > div > div.book-title > h2', first=True)
    if heading is None:
        # The selector did not match (find(first=True) yields None); report
        # "unknown" as an empty string, as get_isbn does, instead of
        # crashing the whole crawl on one odd page.
        return ''
    name = heading.text.strip()
    if name.endswith(presale_marker):
        # Drop the marker itself, then any whitespace that preceded it.
        return name[:-len(presale_marker)].strip()
    return name
def get_isbn(html):
    """Return the book number listed in the sidebar of a book detail page.

    Scans the sidebar list items for one whose text starts with the label
    ``书\u3000\u3000号`` (the two middle characters are ideographic spaces)
    and returns the text after that label.

    Args:
        html: Parsed page object exposing ``find(selector)`` that returns an
            iterable of elements with a ``text`` attribute.

    Returns:
        The text following the label with surrounding whitespace removed, or
        ``''`` when no list item carries the label. If several items match,
        the last one wins.
    """
    label = '书\u3000\u3000号'
    lis = html.find('body > div.container > div > div.col-md-3.pull-right.side > div:nth-child(4) > div.block-body > ul > li')
    isbn = ''
    for li in lis:
        text = li.text
        if text.startswith(label):
            # Slice by the label's length rather than a hard-coded offset,
            # and trim whitespace separating the label from the value.
            isbn = text[len(label):].strip()
    return isbn
def get_price(html):
    """Return the price text shown in the first purchase option of a page.

    Args:
        html: Parsed page object exposing ``find(selector, first=True)``.

    Returns:
        The element's text without its first character (presumably the
        currency symbol -- confirm against the live page), or ``''`` when the
        price element cannot be found.
    """
    price = html.find('body > div.container > div > div.col-md-3.pull-right.side > div.book-approaches > dl:nth-child(1) > dd ', first=True)
    if price is None:
        # No match: return an empty value, as get_isbn does, rather than
        # raising AttributeError on None and aborting the crawl.
        return ''
    return price.text[1:]
# Walk the e-book listing (tab=ebook, sort=new) page by page, visit every book
# linked from it, collect name / isbn / price, and dump everything to
# results.json once the listing has no "next page" control left.
page = 0
results = []
while True:
    r = session.get('http://www.ituring.com.cn/book', params={'tab': 'ebook', 'page': page, 'sort': 'new'})
    ebooks_ele = r.html.find('#tab-book > div.col-md-9.col-sm-9.main > div.g-main > div > ul > li > div.book-img > a')
    for e in ebooks_ele:
        links = e.absolute_links
        if not links:
            # An anchor without a usable href yields an empty set; popping
            # from it would raise KeyError and abort the crawl.
            continue
        link = links.pop()
        print(link)
        ebook_page = session.get(link)
        item = {
            'name': get_name(ebook_page.html),
            'isbn': get_isbn(ebook_page.html),
            'price': get_price(ebook_page.html),
        }
        print(item)
        results.append(item)
    # Stop when the current listing page has no "skip to next" element.
    if not r.html.find('.PagedList-skipToNext'):
        break
    page += 1
# ensure_ascii=False writes the Chinese text verbatim, so the file encoding
# must be pinned to UTF-8 instead of depending on the platform default.
with open('results.json', 'w', encoding='utf-8') as f:
    json.dump(results, f, indent=4, ensure_ascii=False)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment