creamidea/request-jssvc-lib.py

## request-jssvc-lib.py
#!/usr/local/bin/python
# -*- coding: utf-8 -*-

from time import sleep
from argparse import ArgumentParser
import requests
from bs4 import BeautifulSoup
from bs4.element import Tag

class Search(object):
    """
    Search the library OPAC for books and collect every page of results.

    Class attributes:
        api: URL of the OPAC search endpoint; query strings are appended.
        headers: HTTP headers (a browser user-agent) sent with each request.
        cookies: Cookies of the latest response, replayed on the next one.
        books: Result entries (bs4 Tag objects) gathered so far. Every
            instance gets its own list in ``__init__``.
    """
    api = "http://opac.jssvc.edu.cn:8080/opac/openlink.php"
    headers = {
        'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36'
    }
    cookies = None
    books = []

    def __init__(self):
        """
        Give the instance its own cookie slot and result list.

        Without this, all instances would append to the single class-level
        ``books`` list.
        """
        self.cookies = None
        self.books = []

    def toDict(self, books=None):
        """
        Convert result entries (bs4.element.Tag) into dictionaries.

        :param books: iterable of result entries; ``None`` means none.
        :return: list of dictionaries as produced by ``getInfo``.
        """
        if books is None:
            return []
        return [self.getInfo(book) for book in books]

    @staticmethod
    def _strip_index(text):
        """
        Remove the leading "<index>." prefix from a result title.

        Only the first dot is treated as the separator, so titles that
        contain dots themselves ("1.Node.js in Action") stay intact. Text
        without any dot is returned unchanged instead of raising.
        """
        _, dot, rest = text.partition('.')
        return rest if dot else text

    @staticmethod
    def _build_query(sKey, sType="title"):
        """
        Build the query string for a search, percent-encoding both values.

        Escaping keeps characters such as ``&``, ``#`` or ``+`` in the
        keyword from being interpreted as query-string syntax.
        """
        from urllib.parse import quote

        return '?strSearchType={sType}&strText={sKey}&doctype=ALL'.format(
            sType=quote(sType, safe=''), sKey=quote(sKey, safe=''))

    def getInfo(self, book):
        """
        Extract the book information from one result entry.

        :param book: bs4.element.Tag of a single result.
        :return: dict with the keys ``title``, ``link``, ``code``,
            ``author`` and ``publisher``.
        :raises ValueError: if the entry's paragraph does not provide both
            the author and the publisher line.
        """
        heading = book.findChildren()[0]
        anchor = heading.find('a')

        link = 'http://opac.jssvc.edu.cn:8080/opac/{item}'.format(item=anchor['href'])
        # The text node sitting directly inside the heading (not in the link).
        code = heading.find(text=True, recursive=False).strip()
        # Lines 4 and 5 of the paragraph text are read as author and
        # publisher -- assumes the site's current markup; verify if it changes.
        lines = book.find('p').text.split('\n')
        author, publisher = [line.strip() for line in lines[3:5]]

        return {
            'title': self._strip_index(anchor.text),
            'link': link,
            'code': code,
            'author': author,
            'publisher': publisher
        }

    def do(self, sKey, sType="title"):
        """
        Run a search and return all results as dictionaries.

        :param sKey: search keyword.
        :param sType: value sent as ``strSearchType`` (default "title").
        :return: list of dictionaries, one per book found.
        """
        # Start from a clean slate so repeated searches on one instance do
        # not mix results or confuse the first-page detection.
        self.books = []
        self.request(self._build_query(sKey, sType))

        return self.toDict(self.books)

    def request(self, searchs):
        """
        Fetch the page for ``searchs`` and every following page.

        Pages are followed in a loop rather than by recursion, so long
        result sets cannot exhaust the interpreter's recursion limit.

        :param searchs: query string appended to ``api``.
        """
        while True:
            url = '{api}{searchs}'.format(api=self.api, searchs=searchs)
            r = requests.get(url, headers=self.headers, cookies=self.cookies)
            self.cookies = r.cookies
            searchs = self._collect(r.text)
            if not searchs:
                break
            print('>>> next...')
            sleep(1.1)  # throttle: avoid hammering the server

    def parse(self, content):
        """
        Parse a result page, store its entries and continue with the next
        page if there is one.

        :param content: HTML text of a result page.
        """
        next_query = self._collect(content)
        if next_query:
            print('>>> next...')
            sleep(1.1)  # throttle: avoid hammering the server
            self.request(next_query)

    def _collect(self, content):
        """
        Store the entries of one result page and return the next page's
        query string, or ``None`` when there is no further page.
        """
        soup = BeautifulSoup(content, 'html5lib')
        book_content = soup.find('div', {'id': 'book_content'})
        if book_content is None:
            # No result container on the page: nothing to store or follow.
            return None

        # While nothing is stored yet the first pager link is followed,
        # afterwards the second one -- presumably the pager is [next] on
        # the first page and [previous, next] later; verify against the site.
        position = 0 if not self.books else 1
        pager = book_content.find('div', {'class': 'num'})
        links = pager.findAll('a') if pager is not None else []
        next_query = links[position]['href'] if len(links) > position else None

        self.books.extend(book_content.findAll('div', {'class': 'list_books'}))
        return next_query

if __name__ == '__main__':

    # Parse the command line: -k is the search keyword, -o the output file.
    parser = ArgumentParser(description="Get the books from the library.")
    parser.add_argument('-k', dest="keyword", required=True,
                        help='keyword')
    parser.add_argument('-o', dest="output_filename",
                        default="books.txt",
                        help='output the file [default: books.txt]')
    args = parser.parse_args()

    search = Search()
    books = search.do(args.keyword)

    # Write one tab-separated line per book. The encoding is fixed to UTF-8
    # so non-ASCII metadata is written the same way on every platform
    # instead of depending on (or failing under) the locale's default.
    filename = args.output_filename
    row = '{code}\t{title}\t{author}\t{publisher}\t{link}'
    with open(filename, 'w', encoding='utf-8') as f:
        f.write('\n'.join(row.format(**book) for book in books))
    print('DONE. Save in {filename}'.format(filename=filename))
	#!/usr/local/bin/python
	# -*- coding: utf-8 -*-

	from time import sleep
	from argparse import ArgumentParser
	import requests
	from bs4 import BeautifulSoup
	from bs4.element import Tag

	class Search(object):
	    """
	    Search the library OPAC for books and collect every page of results.

	    Class attributes:
	        api: URL of the OPAC search endpoint; query strings are appended.
	        headers: HTTP headers (a browser user-agent) sent with each request.
	        cookies: Cookies of the latest response, replayed on the next one.
	        books: Result entries (bs4 Tag objects) gathered so far. Every
	            instance gets its own list in ``__init__``.
	    """
	    api = "http://opac.jssvc.edu.cn:8080/opac/openlink.php"
	    headers = {
	        'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36'
	    }
	    cookies = None
	    books = []

	    def __init__(self):
	        """
	        Give the instance its own cookie slot and result list.

	        Without this, all instances would append to the single class-level
	        ``books`` list.
	        """
	        self.cookies = None
	        self.books = []

	    def toDict(self, books=None):
	        """
	        Convert result entries (bs4.element.Tag) into dictionaries.

	        :param books: iterable of result entries; ``None`` means none.
	        :return: list of dictionaries as produced by ``getInfo``.
	        """
	        if books is None:
	            return []
	        return [self.getInfo(book) for book in books]

	    @staticmethod
	    def _strip_index(text):
	        """
	        Remove the leading "<index>." prefix from a result title.

	        Only the first dot is treated as the separator, so titles that
	        contain dots themselves ("1.Node.js in Action") stay intact. Text
	        without any dot is returned unchanged instead of raising.
	        """
	        _, dot, rest = text.partition('.')
	        return rest if dot else text

	    @staticmethod
	    def _build_query(sKey, sType="title"):
	        """
	        Build the query string for a search, percent-encoding both values.

	        Escaping keeps characters such as ``&``, ``#`` or ``+`` in the
	        keyword from being interpreted as query-string syntax.
	        """
	        from urllib.parse import quote

	        return '?strSearchType={sType}&strText={sKey}&doctype=ALL'.format(
	            sType=quote(sType, safe=''), sKey=quote(sKey, safe=''))

	    def getInfo(self, book):
	        """
	        Extract the book information from one result entry.

	        :param book: bs4.element.Tag of a single result.
	        :return: dict with the keys ``title``, ``link``, ``code``,
	            ``author`` and ``publisher``.
	        :raises ValueError: if the entry's paragraph does not provide both
	            the author and the publisher line.
	        """
	        heading = book.findChildren()[0]
	        anchor = heading.find('a')

	        link = 'http://opac.jssvc.edu.cn:8080/opac/{item}'.format(item=anchor['href'])
	        # The text node sitting directly inside the heading (not in the link).
	        code = heading.find(text=True, recursive=False).strip()
	        # Lines 4 and 5 of the paragraph text are read as author and
	        # publisher -- assumes the site's current markup; verify if it changes.
	        lines = book.find('p').text.split('\n')
	        author, publisher = [line.strip() for line in lines[3:5]]

	        return {
	            'title': self._strip_index(anchor.text),
	            'link': link,
	            'code': code,
	            'author': author,
	            'publisher': publisher
	        }

	    def do(self, sKey, sType="title"):
	        """
	        Run a search and return all results as dictionaries.

	        :param sKey: search keyword.
	        :param sType: value sent as ``strSearchType`` (default "title").
	        :return: list of dictionaries, one per book found.
	        """
	        # Start from a clean slate so repeated searches on one instance do
	        # not mix results or confuse the first-page detection.
	        self.books = []
	        self.request(self._build_query(sKey, sType))

	        return self.toDict(self.books)

	    def request(self, searchs):
	        """
	        Fetch the page for ``searchs`` and every following page.

	        Pages are followed in a loop rather than by recursion, so long
	        result sets cannot exhaust the interpreter's recursion limit.

	        :param searchs: query string appended to ``api``.
	        """
	        while True:
	            url = '{api}{searchs}'.format(api=self.api, searchs=searchs)
	            r = requests.get(url, headers=self.headers, cookies=self.cookies)
	            self.cookies = r.cookies
	            searchs = self._collect(r.text)
	            if not searchs:
	                break
	            print('>>> next...')
	            sleep(1.1)  # throttle: avoid hammering the server

	    def parse(self, content):
	        """
	        Parse a result page, store its entries and continue with the next
	        page if there is one.

	        :param content: HTML text of a result page.
	        """
	        next_query = self._collect(content)
	        if next_query:
	            print('>>> next...')
	            sleep(1.1)  # throttle: avoid hammering the server
	            self.request(next_query)

	    def _collect(self, content):
	        """
	        Store the entries of one result page and return the next page's
	        query string, or ``None`` when there is no further page.
	        """
	        soup = BeautifulSoup(content, 'html5lib')
	        book_content = soup.find('div', {'id': 'book_content'})
	        if book_content is None:
	            # No result container on the page: nothing to store or follow.
	            return None

	        # While nothing is stored yet the first pager link is followed,
	        # afterwards the second one -- presumably the pager is [next] on
	        # the first page and [previous, next] later; verify against the site.
	        position = 0 if not self.books else 1
	        pager = book_content.find('div', {'class': 'num'})
	        links = pager.findAll('a') if pager is not None else []
	        next_query = links[position]['href'] if len(links) > position else None

	        self.books.extend(book_content.findAll('div', {'class': 'list_books'}))
	        return next_query

	if __name__ == '__main__':

	    # Parse the command line: -k is the search keyword, -o the output file.
	    parser = ArgumentParser(description="Get the books from the library.")
	    parser.add_argument('-k', dest="keyword", required=True,
	                        help='keyword')
	    parser.add_argument('-o', dest="output_filename",
	                        default="books.txt",
	                        help='output the file [default: books.txt]')
	    args = parser.parse_args()

	    search = Search()
	    books = search.do(args.keyword)

	    # Write one tab-separated line per book. The encoding is fixed to UTF-8
	    # so non-ASCII metadata is written the same way on every platform
	    # instead of depending on (or failing under) the locale's default.
	    filename = args.output_filename
	    row = '{code}\t{title}\t{author}\t{publisher}\t{link}'
	    with open(filename, 'w', encoding='utf-8') as f:
	        f.write('\n'.join(row.format(**book) for book in books))
	    print('DONE. Save in {filename}'.format(filename=filename))