Skip to content

Instantly share code, notes, and snippets.

@jayzhan211
Created November 18, 2018 03:42
Show Gist options
  • Save jayzhan211/2aa2b381dff32c8b87d84873453f5c37 to your computer and use it in GitHub Desktop.
import requests
import re
import os
import sys
import time
from bs4 import BeautifulSoup
# Scrape wenku8.net: walk the article catalogue, and for every book download
# each chapter's text into <load_path>/<book title>/<chapter title>/<chapter title>.txt.
load_path = r'C:\Users\owner\Wenku8'

# NOTE(review): the Cookie header embeds a logged-in session (user id and
# password hash). It will expire, and should not be published in a gist.
headers = {
'User-Agent':'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.108 Safari/537.36',
'Cookie':'Hm_lvt_d72896ddbf8d27c750e3b365ea2fc902=1542500172,1542500185; UM_distinctid=167242d0e1d578-0a974a88240399-3a3a5c0e-1fa400-167242d0e1e4; jieqiUserCharset=big5; PHPSESSID=cd1k29vc75laat22jq0p7sooca705gd8; jieqiUserInfo=jieqiUserId%3D416755%2CjieqiUserName%3Djiburiru%2CjieqiUserGroup%3D3%2CjieqiUserVip%3D0%2CjieqiUserPassword%3D1d95f663dfa0ce76562ee8ca0ea90089%2CjieqiUserName_un%3Djiburiru%2CjieqiUserHonor_un%3D%26%23x65B0%3B%26%23x624B%3B%26%23x4E0A%3B%26%23x8DEF%3B%2CjieqiUserGroupName_un%3D%26%23x666E%3B%26%23x901A%3B%26%23x4F1A%3B%26%23x5458%3B%2CjieqiUserLogin%3D1542505405; jieqiVisitInfo=jieqiUserLogin%3D1542505405%2CjieqiUserId%3D416755; CNZZDATA1309966=cnzz_eid%3D2004785943-1542499396-%26ntime%3D1542501320; Hm_lpvt_d72896ddbf8d27c750e3b365ea2fc902=1542505409; CNZZDATA1259916661=2097782824-1542499949-%7C1542501702'
}

catalogue_url = 'https://www.wenku8.net/modules/article/articlelist.php?class='
resp = requests.get(url=catalogue_url, headers=headers)
resp.encoding = 'big5'  # the site serves Big5-encoded pages
soup = BeautifulSoup(resp.text, 'lxml')

# Each book entry sits in a fixed-width floated div in the catalogue page.
for div in soup.find_all('div', {'style': 'width:95px;float:left;'}):
    for a in div.find_all('a'):
        title = a.get('title')
        book_href = a.get('href')
        # Keep only the digits of the href, then drop the first digit
        # (original behaviour — presumably a site numbering quirk; TODO confirm).
        # Renamed from `id`, which shadowed the builtin.
        book_id = re.sub('[^0-9]', '', book_href)[1:]
        book_url = 'https://www.wenku8.net/modules/article/reader.php?aid=' + str(book_id)
        print('{}: {}'.format(title, book_url))

        # Fixed: exist_ok=True replaces the race-prone exists()+makedirs()
        # pair; full paths replace os.chdir(), which mutated global state
        # mid-loop and made relative writes fragile.
        book_dir = os.path.join(load_path, title)
        os.makedirs(book_dir, exist_ok=True)

        resp2 = requests.get(url=book_url, headers=headers)
        resp2.encoding = 'big5'
        book_soup = BeautifulSoup(resp2.text, 'lxml')
        # print(book_soup)

        # Chapter links live in <td class="ccss"> cells of the reader page.
        for td in book_soup.find_all('td', {'class': 'ccss'}):
            for chap in td.find_all('a'):
                print(chap)
                # NOTE(review): hrefs here may be relative to the reader
                # page; if downloads 404, join against book_url — verify.
                chap_url = chap.get('href')
                title2 = chap.text
                print('{}: {}'.format(title2, chap_url))

                chap_dir = os.path.join(book_dir, title2)
                os.makedirs(chap_dir, exist_ok=True)

                resp3 = requests.get(url=chap_url, headers=headers)
                resp3.encoding = 'big5'
                chap_soup = BeautifulSoup(resp3.text, 'lxml')
                for line in chap_soup.find_all('div', {'id': 'content'}):
                    print(line.text)
                    # Fixed: write UTF-8 directly. The original round-tripped
                    # through sys.stdin.encoding, which is None when stdin is
                    # not a terminal (AttributeError) and lossy otherwise.
                    out_path = os.path.join(chap_dir, title2 + '.txt')
                    with open(out_path, 'w', encoding='utf-8') as f:
                        f.write(line.text)
                    print('OK!!!')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment