Skip to content

Instantly share code, notes, and snippets.

@jason-xuan
Last active July 21, 2017 02:47
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save jason-xuan/094a6049ebe6cb1f2f99be5b7fbb8035 to your computer and use it in GitHub Desktop.
Save jason-xuan/094a6049ebe6cb1f2f99be5b7fbb8035 to your computer and use it in GitHub Desktop.
下载某小说网站的小说的简单代码(单线程)
import requests
from re import compile
from tqdm import tqdm
from bs4 import BeautifulSoup
# Scheme and host of the novel site; site-relative chapter links scraped from
# an index page are prefixed with this to form absolute URLs.
root_url = 'http://www.biquzi.com'
def get_urls_from_main_page(url, timeout=30) -> list:
    """Collect the absolute chapter URLs linked from a book's index page.

    The page is fetched, decoded as GBK, and every anchor whose ``href``
    starts with ``/<book id>/`` is kept, where the book id is the last
    non-empty ``/``-separated segment of *url*.

    Args:
        url: Address of the book's index page, e.g.
            ``'http://www.biquzi.com/0_32/'``.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The matching links, each prefixed with ``root_url``, in page order.

    Raises:
        requests.RequestException: On network failure, timeout, or an HTTP
            error status.
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    response.encoding = 'gbk'
    soup = BeautifulSoup(response.text, 'lxml')

    book_id = [part for part in url.split('/') if part != ''][-1]
    # A literal prefix test: the book id is not a pattern, so it must not be
    # interpreted as a regular expression.
    prefix = f'/{book_id}/'

    # href=True skips anchors that carry no href attribute at all (named
    # anchors, script-driven links), which would otherwise raise KeyError.
    return [
        root_url + anchor['href']
        for anchor in soup.find_all('a', href=True)
        if anchor['href'].startswith(prefix)
    ]
def process_page(url, timeout=30) -> str:
    """Download one chapter page and return its title and text.

    The page is decoded as GBK. The title is read from the ``<h1>`` inside
    the ``div`` with class ``bookname``; the chapter text is every string
    inside the element with id ``content``, with non-breaking spaces removed.

    Args:
        url: Absolute URL of the chapter page.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The title, a newline, the chapter text, and a trailing newline.

    Raises:
        requests.RequestException: On network failure, timeout, or an HTTP
            error status.
        ValueError: If the page lacks the expected title or content element.
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    response.encoding = 'gbk'
    soup = BeautifulSoup(response.text, 'lxml')

    heading = soup.find('div', 'bookname')
    body = soup.find(id='content')
    # Fail with a message that names the page instead of an opaque
    # "'NoneType' object has no attribute ..." error.
    if heading is None or heading.h1 is None or body is None:
        raise ValueError(f'unexpected page layout: {url}')

    title = heading.h1.text
    content = "".join(piece.replace('\xa0', '') for piece in body.strings)
    return title + '\n' + content + '\n'
def download(url, path, max_retries=None, retry_delay=1.0):
    """Download every chapter of a book into a single text file.

    Chapter links are taken from the index page at *url*; each chapter is
    fetched in order and appended to *path*. A chapter that fails to download
    is retried after printing the error.

    Args:
        url: Address of the book's index page.
        path: Output file; created or overwritten, written as UTF-8.
        max_retries: Maximum number of retries per chapter after the first
            failed attempt. ``None`` (the default) retries forever.
        retry_delay: Seconds to pause before retrying a failed chapter.

    Raises:
        Exception: The last download error for a chapter, once
            *max_retries* is exhausted.
    """
    import time

    chapter_urls = get_urls_from_main_page(url)
    # Explicit UTF-8 so Chinese text is written correctly whatever the
    # platform's default encoding is.
    with open(path, 'w', encoding='utf-8') as f:
        for chapter_url in tqdm(chapter_urls):
            failures = 0
            # keep trying
            while True:
                try:
                    text = process_page(chapter_url)
                except Exception as e:
                    print(e)
                    failures += 1
                    if max_retries is not None and failures > max_retries:
                        raise
                    # Pause so a persistent failure does not hammer the
                    # server in a tight loop.
                    time.sleep(retry_delay)
                else:
                    # Written outside the try so that a file error is
                    # reported instead of triggering an endless re-download.
                    f.write(text)
                    break
if __name__ == '__main__':
    # Script entry point: fetch the book at this index page into a local file.
    book_index, output_file = 'http://www.biquzi.com/0_32/', '全职高手.txt'
    download(book_index, output_file)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment