Last active
December 21, 2023 08:49
-
-
Save jackeyGao/d73381087b1278177aab60636f635119 to your computer and use it in GitHub Desktop.
全宋词爬虫解析脚本
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*- | |
''' | |
File Name: parser.py | |
Author: JackeyGao | |
mail: gaojunqi@outlook.com | |
''' | |
import sys | |
import random | |
import time | |
import requests | |
import re | |
from parsel import Selector | |
from peewee import IntegrityError | |
from db import Ci | |
from db import CiAuthor | |
header = { | |
"Connection": "keep-alive", | |
"Origin": "http://qsc.zww.cn", | |
"Upgrade-Insecure-Requests": "1", | |
"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36", | |
"Content-Type": "application/x-www-form-urlencoded", | |
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8", | |
"Referer": "http://qsc.zww.cn/", | |
"Accept-Encoding": "gzip, deflate", | |
"Accept-Language": "zh-CN,zh;q=0.8,en;q=0.6,zh-TW;q=0.4", | |
"Cookie": "Hm_lvt_12506b8a4147836b0046047de09b2a2e=1493688567; _D_SID=92CED13DD066A18AEC64F1086BA2B715; ASPSESSIONIDSABSRATC=OOFAEFEAJAGIAIEMGGAEDBNL; UM_distinctid=15c6821bb13453-0fd27be8dc79a5-30657509-13c680-15c6821bb14468; CNZZDATA618132=cnzz_eid%3D761011847-1496395659-null%26ntime%3D1496395659" | |
} | |
reload(sys) | |
sys.setdefaultencoding('utf-8') | |
seek_patt = re.compile(r"\((.*?)\)", re.I|re.X) | |
# --------------------------------- | |
class QTSBase(object): | |
def filllist(self, content): | |
self.content = content | |
def fillpage(self, fillpage): | |
self.page = fillpage | |
def fillbody(self, content): | |
self.content = content | |
class ParentBase(object): | |
def __init__(self): | |
self.QTS = QTSBase() | |
# ---------------------------------- | |
parent = ParentBase() | |
exec("parent.QTS.fillpage('第1页 共92页 1564条')") | |
def __with_seek_type__(seek_type): | |
def request(pageno, value=''): | |
url = 'http://qsc.zww.cn/getdata.asp' | |
payload = { | |
'seektype': seek_type, | |
'seekvalue': value, | |
'pageno': int(pageno) | |
} | |
resp = requests.post( | |
url, | |
data=payload, | |
headers=header | |
) | |
return resp | |
return request | |
def parse(html, callback, *args, **kwargs): | |
html = html.decode('utf8') | |
html = html.encode('latin1') | |
html = html.decode('gb2312', 'ignore') | |
sel = Selector(text=html) | |
return callback(sel, *args, **kwargs) | |
def callback_author_list(sel, *args, **kwargs): | |
data = sel.xpath('//script').extract()[0] | |
for l in data.splitlines(): | |
if not l.startswith('parent.QTS.filllist'): | |
continue | |
exec(l) | |
sel = Selector( | |
text=unicode(parent.QTS.content) | |
) | |
for i in sel.xpath('//a'): | |
seek = i.xpath('@onclick').extract()[0] | |
seek = seek_patt.findall(seek)[0] | |
_type, value, pageno = seek.split(',') | |
text = i.xpath('text()').extract()[0] | |
if _type != '10': | |
continue | |
name = text.replace('…', '') | |
# save author to database. | |
try: | |
CiAuthor.create( | |
value = value, | |
name = name | |
) | |
print("主键%s, 已创建." % value) | |
except IntegrityError: | |
print("重复主键%s, 已跳过." % value) | |
def callback_author_info(sel, *args, **kwargs): | |
data = sel.xpath('//script').extract()[0] | |
for l in data.splitlines(): | |
if not l.startswith('parent.QTS.fillbody'): | |
continue | |
exec(l) | |
sel = Selector( | |
text=unicode(parent.QTS.content) | |
) | |
ds = sel.xpath('//text()').extract() | |
name = sel.xpath('//text()').extract()[1] | |
lon = ''.join([s.strip() for s in ds[5:]]).strip() | |
author = kwargs["author"] | |
author.long_desc = lon | |
#author.short_desc = sht | |
author.save() | |
print("主键%s(%s), 已更新" % (author.value, author.name)) | |
return sel | |
return sel | |
def callback_ci_info(sel, *args, **kwargs): | |
data = sel.xpath('//script').extract()[0] | |
for l in data.splitlines(): | |
if not l.startswith('parent.QTS.fillbody'): | |
continue | |
if '宋体' in l: | |
continue | |
exec(l) | |
sel = Selector( | |
text=unicode(parent.QTS.content) | |
) | |
value = kwargs["seekid"] | |
rhythmic = sel.xpath('//b/text()').extract()[0] | |
author = sel.xpath('//text()').extract()[1] | |
contents = sel.xpath('//text()').extract()[2:] | |
content = '\n'.join(contents) | |
try: | |
Ci.create( | |
value = value, | |
rhythmic = rhythmic, | |
author = author, | |
content = content | |
) | |
print("主键%s, 已创建." % value) | |
except IntegrityError: | |
Ci.update( | |
rhythmic = rhythmic, | |
author = author, | |
content = content | |
).where( | |
Ci.value == value | |
).execute() | |
print("重复主键%s, 已更新." % value) | |
return sel | |
f_author_list = __with_seek_type__(1) | |
f_author_info = __with_seek_type__(10) | |
f_ci_list = __with_seek_type__(5) | |
f_ci_info = __with_seek_type__(9) | |
#resp = f_ci_info(1, value=1460) | |
#sel = parse(resp.text, callback_ci_info, seekid=1) | |
if __name__ == '__main__': | |
for p in range(1, 93): | |
resp = f_author_list(p, value=1) | |
sel = parse(resp.text, callback_author_list) | |
# crawl author info | |
for i in CiAuthor.select().where(CiAuthor.value>0): | |
resp = f_author_info(1, value=i.value) | |
sel = parse(resp.text, callback_author_info, author=i) | |
# crawl author ci list | |
for i in range(1, 21051): | |
try: | |
resp = f_ci_info(1, value=i) | |
except requests.exceptions.ConnectionError as e: | |
wait_seconds = random.choice(range(1, 10)) | |
print("等待%s..异常(%s)" % (wait_seconds, str(e))) | |
time.sleep(wait_seconds) | |
continue | |
sel = parse(resp.text, callback_ci_info, seekid=i) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
python 3 里,字符处理不了吧。 encode 和 unicode 函数不好用。