Skip to content

Instantly share code, notes, and snippets.

@creamidea
Created March 12, 2017 07:47
Show Gist options
  • Save creamidea/1a946cbe07f9a523e6d861171e3c2813 to your computer and use it in GitHub Desktop.
Save creamidea/1a946cbe07f9a523e6d861171e3c2813 to your computer and use it in GitHub Desktop.
获取苏州职业大学图书馆书籍信息(DEMO)
#!/usr/local/bin/python
# -*- coding: utf-8 -*-
from argparse import ArgumentParser
from time import sleep
from urllib.parse import urlencode

import requests
from bs4 import BeautifulSoup
from bs4.element import Tag
class Search(object):
    """
    Search the library catalogue and collect the matching book entries.

    ``do()`` is the entry point: it sends the query, follows the pager links
    page by page, and returns every collected entry as a plain dict.
    """

    # Search endpoint; query strings (including pager hrefs) are appended to it.
    api = "http://opac.jssvc.edu.cn:8080/opac/openlink.php"
    # Browser-like user agent sent with every request.
    headers = {
        'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36'
    }
    # Seconds to wait for the server before a request is abandoned.
    timeout = 10
    cookies = None

    def __init__(self):
        """
        Init Search
        """
        # Per-instance state: a class-level list would be shared (and keep
        # growing) across every Search object.
        self.cookies = None
        self.books = []

    def toDict(self, books=None):
        """
        Convert a list of bs4.element.Tag entries into a list of dicts.

        :param books: iterable of result entries; ``None`` means no entries.
        :return: list with one ``getInfo()`` dict per entry.
        """
        if books is None:
            books = []
        return [self.getInfo(book) for book in books]

    @staticmethod
    def _stripIndex(text):
        """
        Drop everything up to the first dot of a title (presumably the
        result counter, e.g. ``1.``) while keeping any later dots intact.
        """
        _, dot, rest = text.partition('.')
        return rest if dot else text

    def getInfo(self, book):
        """
        Extract the book details from one bs4.element.Tag result entry.

        :param book: Tag of a single result entry.
        :return: dict with the keys ``title``, ``link``, ``code``,
            ``author`` and ``publisher``.
        """
        children = book.findChildren()
        h3 = children[0]  # the heading is taken to be the first descendant
        title = h3.find('a')
        link = 'http://opac.jssvc.edu.cn:8080/opac/{item}'.format(item=title['href'])
        # ``code`` is the text node sitting directly inside the heading,
        # outside the title link.
        code = h3.find(text=True, recursive=False).strip()
        # NOTE(review): author and publisher are read from the 4th and 5th
        # lines of the first paragraph -- this depends on the page layout.
        details = [part.strip() for part in book.find('p').text.split('\n')[3:5]]
        # Pad so a short paragraph yields empty fields instead of a ValueError.
        details += [''] * (2 - len(details))
        author, publisher = details
        return {
            'title': self._stripIndex(title.text),
            'link': link,
            'code': code,
            'author': author,
            'publisher': publisher
        }

    @staticmethod
    def _buildQuery(sKey, sType):
        """
        Build the URL-encoded query string for a search request.
        """
        # urlencode escapes characters such as '&', '#', '+' and spaces that
        # would otherwise corrupt the query.
        return '?' + urlencode([
            ('strSearchType', sType),
            ('strText', sKey),
            ('doctype', 'ALL'),
        ])

    def do(self, sKey, sType="title"):
        """
        Query the book search API and return every matching book.

        :param sKey: search keyword.
        :param sType: search field passed as ``strSearchType``.
        :return: list of dicts as produced by ``getInfo()``.
        """
        # Start from a clean slate so repeated calls do not mix results and
        # the first-page detection in _collect() keeps working.
        self.books = []
        self.request(self._buildQuery(sKey, sType))
        return self.toDict(self.books)

    def request(self, searchs):
        """
        Fetch a result page and keep following the pager until it ends.

        :param searchs: query string appended to ``api``.
        :raises requests.HTTPError: if the server answers with an error status.
        """
        # Iterative on purpose: recursing once per page can exhaust the
        # interpreter's recursion limit on large result sets.
        while True:
            url = '{api}{searchs}'.format(api=self.api, searchs=searchs)
            r = requests.get(url, headers=self.headers, cookies=self.cookies,
                             timeout=self.timeout)
            r.raise_for_status()
            # NOTE(review): this keeps only the cookies set by the latest
            # response -- confirm the server does not need earlier ones.
            self.cookies = r.cookies
            searchs = self._collect(r.text)
            if not searchs:
                break
            print('>>> next...')
            sleep(1.1)  # throttle: requesting too quickly causes problems

    @staticmethod
    def _nextLink(hrefs, firstPage):
        """
        Pick the href of the next result page from the pager links, or None.
        """
        # The first page uses the first pager link, later pages the second;
        # presumably a "previous" link precedes "next" from page two on.
        index = 0 if firstPage else 1
        return hrefs[index] if index < len(hrefs) else None

    def _collect(self, content):
        """
        Store the entries of one result page and return the next page's link.
        """
        soup = BeautifulSoup(content, 'html5lib')
        bookContent = soup.find('div', {'id': 'book_content'})
        if bookContent is None:
            # No result container on the page: nothing to collect or follow.
            return None
        # Must be evaluated before this page's entries are appended.
        firstPage = not self.books
        pager = bookContent.find('div', {'class': 'num'})
        hrefs = [] if pager is None else [a.get('href') for a in pager.findAll('a')]
        self.books.extend(bookContent.findAll('div', {'class': 'list_books'}))
        return self._nextLink(hrefs, firstPage)

    def parse(self, content):
        """
        Parse a returned page, collect its book entries, and continue with
        the following pages when the pager offers a next link.

        :param content: HTML text of a result page.
        """
        searchs = self._collect(content)
        if searchs:
            print('>>> next...')
            sleep(1.1)  # throttle: requesting too quickly causes problems
            self.request(searchs)
if __name__ == '__main__':
    # Parse the command line.
    parser = ArgumentParser(description="Get the books from the library.")
    parser.add_argument('-k', dest="keyword", required=True,
                        help='keyword')
    parser.add_argument('-o', dest="output_filename",
                        default="books.txt",
                        help='output the file [default: books.txt]')
    args = parser.parse_args()

    # Run the search for the given keyword.
    search = Search()
    books = search.do(args.keyword)

    # Write one tab-separated line per book. The encoding is pinned to UTF-8
    # so non-ASCII text does not depend on the platform's default encoding.
    filename = args.output_filename
    row = '{code}\t{title}\t{author}\t{publisher}\t{link}'
    with open(filename, 'w', encoding='utf-8') as f:
        f.write('\n'.join(row.format(**book) for book in books))
    print('DONE. Save in {filename}'.format(filename=filename))
@creamidea
Copy link
Author

creamidea commented Mar 12, 2017

一个信息的 HTML 结构
image

@creamidea
Copy link
Author

$ ./request-jssvc-lib.py -k python
>>> next...
>>> next...
DONE. Save in books.txt

之后,查看文件 books.txt 即可,或者将其导入 Excel。

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment