# zhengxiaowai/test.py

## test.py
# coding: u8

import json
from requests_html import HTMLSession

# One HTMLSession shared by every request this script makes (listing pages
# and the individual book pages alike).
session = HTMLSession()


def get_name(html):
	"""Return the book title shown on an ebook detail page.

	Args:
		html: Parsed page object exposing ``find(selector, first=True)``;
			the script passes the ``.html`` attribute of a response.

	Returns:
		The title text with surrounding whitespace removed and a trailing
		``[预售]`` (pre-sale) marker dropped, or ``''`` when the title
		element is not present on the page.
	"""
	presale_tag = '[预售]'
	title = html.find('body > div.container > div > div:nth-child(1) > div > div > div.col-md-8 > div > div.book-title > h2', first=True)
	if title is None:
		# find(..., first=True) gives None when nothing matches; report an
		# empty name instead of failing on ``None.text`` and losing the
		# whole crawl.
		return ''

	name = title.text.strip()
	if name.endswith(presale_tag):
		# Slice by the marker's own length rather than a hard-coded count.
		return name[:-len(presale_tag)].strip()

	return name


def get_isbn(html):
	"""Return the book number listed in the detail page's side panel.

	Every ``li`` matched by the side-panel selector is inspected; entries
	whose text begins with the book-number label contribute the text that
	follows the label. When several entries match, the last one wins.

	Args:
		html: Parsed page object exposing ``find(selector)``.

	Returns:
		The text after the label, or ``''`` when no entry carries it.
	"""
	label = '书\u3000\u3000号'
	selector = 'body > div.container > div > div.col-md-3.pull-right.side > div:nth-child(4) > div.block-body > ul > li'

	candidates = [
		entry.text[len(label):]
		for entry in html.find(selector)
		if entry.text.startswith(label)
	]
	return candidates[-1] if candidates else ''

def get_price(html):
	"""Return the price text from an ebook detail page.

	Args:
		html: Parsed page object exposing ``find(selector, first=True)``.

	Returns:
		The text of the first matching ``dd`` element with its first
		character removed (presumably the currency sign; confirm against
		the live page), or ``''`` when the element is not present.
	"""
	price = html.find('body > div.container > div > div.col-md-3.pull-right.side > div.book-approaches > dl:nth-child(1) > dd ', first=True)
	if price is None:
		# find(..., first=True) gives None when nothing matches; report an
		# empty price instead of failing on ``None.text``.
		return ''

	return price.text[1:]


# Walk the paginated ebook listing, scrape every linked book page, and dump
# the collected records to results.json once the last page has been handled.
is_continue = True
page = 0
results = []
while is_continue:
	# Fetch one listing page; `page` is sent as a query parameter and starts at 0.
	r = session.get('http://www.ituring.com.cn/book', params={'tab': 'ebook', 'page': page, 'sort': 'new'})
	ebooks_ele = r.html.find('#tab-book > div.col-md-9.col-sm-9.main > div.g-main > div > ul > li > div.book-img > a')
	for e in ebooks_ele:
		# absolute_links is a set; take one link from it (each cover anchor
		# is assumed to carry a single href; verify against the page).
		link = e.absolute_links.pop()
		print(link)
		ebook_page = session.get(link)
		name = get_name(ebook_page.html)
		isbn = get_isbn(ebook_page.html)
		price = get_price(ebook_page.html)

		item = {'name': name, 'isbn': isbn, 'price': price}
		print(item)
		results.append(item)

	# Keep paging for as long as the listing contains an element with the
	# PagedList-skipToNext class; stop once it is absent.
	next_page = r.html.find('.PagedList-skipToNext')
	if next_page:
		page += 1
	else:
		is_continue = False

# ensure_ascii=False writes non-ASCII titles verbatim, so the file is opened
# as UTF-8 explicitly instead of relying on the platform's default encoding
# (which may be unable to encode them).
with open('results.json', 'w', encoding='utf-8') as f:
	json.dump(results, f, indent=4, ensure_ascii=False)
	# coding: u8

	import json
	from requests_html import HTMLSession

	# One HTMLSession shared by every request made by the code that follows.
	session = HTMLSession()


	def get_name(html):
		"""Return the book title shown on an ebook detail page.

		Args:
			html: Parsed page object exposing ``find(selector, first=True)``.

		Returns:
			The title text with surrounding whitespace removed and a
			trailing ``[预售]`` (pre-sale) marker dropped, or ``''`` when
			the title element is not present on the page.
		"""
		presale_tag = '[预售]'
		title = html.find('body > div.container > div > div:nth-child(1) > div > div > div.col-md-8 > div > div.book-title > h2', first=True)
		if title is None:
			# find(..., first=True) gives None when nothing matches; report
			# an empty name instead of failing on ``None.text``.
			return ''

		name = title.text.strip()
		if name.endswith(presale_tag):
			# Slice by the marker's own length rather than a hard-coded count.
			return name[:-len(presale_tag)].strip()

		return name



	def get_isbn(html):
		"""Return the book number listed in the detail page's side panel.

		Every ``li`` matched by the side-panel selector is inspected; an
		entry whose text begins with the book-number label supplies the
		text that follows the label. When several entries match, the last
		one wins.

		Args:
			html: Parsed page object exposing ``find(selector)``.

		Returns:
			The text after the label, or ``''`` when no entry carries it.
		"""
		label = '书\u3000\u3000号'
		lis = html.find('body > div.container > div > div.col-md-3.pull-right.side > div:nth-child(4) > div.block-body > ul > li')

		isbn = ''
		for li in lis:
			if li.text.startswith(label):
				# Drop exactly the label, whatever its length.
				isbn = li.text[len(label):]

		return isbn

	def get_price(html):
		"""Return the price text from an ebook detail page.

		Args:
			html: Parsed page object exposing ``find(selector, first=True)``.

		Returns:
			The text of the first matching ``dd`` element with its first
			character removed (presumably the currency sign; confirm
			against the live page), or ``''`` when the element is absent.
		"""
		price = html.find('body > div.container > div > div.col-md-3.pull-right.side > div.book-approaches > dl:nth-child(1) > dd ', first=True)
		if price is None:
			# find(..., first=True) gives None when nothing matches.
			return ''

		return price.text[1:]



	# Walk the paginated ebook listing, scrape every linked book page, and
	# dump the collected records to results.json after the last page.
	is_continue = True
	page = 0
	results = []
	while is_continue:
		# Fetch one listing page; `page` is sent as a query parameter and starts at 0.
		r = session.get('http://www.ituring.com.cn/book', params={'tab': 'ebook', 'page': page, 'sort': 'new'})
		ebooks_ele = r.html.find('#tab-book > div.col-md-9.col-sm-9.main > div.g-main > div > ul > li > div.book-img > a')
		for e in ebooks_ele:
			# absolute_links is a set; take one link from it (each cover
			# anchor is assumed to carry a single href; verify).
			link = e.absolute_links.pop()
			print(link)
			ebook_page = session.get(link)
			name = get_name(ebook_page.html)
			isbn = get_isbn(ebook_page.html)
			price = get_price(ebook_page.html)

			item = {'name': name, 'isbn': isbn, 'price': price}
			print(item)
			results.append(item)

		# Keep paging for as long as the listing contains an element with
		# the PagedList-skipToNext class; stop once it is absent.
		next_page = r.html.find('.PagedList-skipToNext')
		if next_page:
			page += 1
		else:
			is_continue = False

	# ensure_ascii=False writes non-ASCII titles verbatim, so the file is
	# opened as UTF-8 explicitly instead of relying on the platform default.
	with open('results.json', 'w', encoding='utf-8') as f:
		json.dump(results, f, indent=4, ensure_ascii=False)