Last active
February 21, 2017 10:26
-
-
Save qzane/d9b0772f2fecda112b395f04d8057b23 to your computer and use it in GitHub Desktop.
download ebooks from www.bookbao8.com
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# coding: utf-8
''' download ebooks from http://www.bookbao8.com/ '''
import sys
from multiprocessing import Pool

import requests
from lxml import etree
# URL template for the pages of one book; ``{}`` is filled with the page number.
URL = r'http://www.bookbao8.com/views/200910/06/id_XMjg4NDg=_{}.html'
# One URL per page, numbered 1 through 23 inclusive.
URLs = [URL.format(i) for i in range(1, 1 + 23)]
WORKER = 5  # pool size; speed up by setting a larger number
def clean(text):
    """Extract the text held in the page's ``<dd id="contents">`` element.

    Args:
        text: HTML source of one page.

    Returns:
        The direct text nodes of every ``dd`` element whose ``id`` is
        ``contents``, concatenated in document order, with each carriage
        return replaced by a newline. An empty string when the parser
        produces no tree or no such element exists.
    """
    tree = etree.HTML(text)
    if tree is None:
        # NOTE(review): lxml can hand back None instead of a tree for input
        # with no parsable markup; treat that as "no content" rather than
        # failing with AttributeError on .xpath below.
        return ''
    texts = tree.xpath(r'//dd[@id="contents"]/text()')
    return ''.join(texts).replace('\r', '\n')
def from_url(url, timeout=30):
    """Download one page and return the text of its contents element.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The text that ``clean`` extracts from the page, after the response
        body is decoded as GBK and every literal ``<br/>`` is replaced by a
        newline.

    Raises:
        requests.RequestException: On connection failure, timeout, or an
            HTTP error status.
        UnicodeDecodeError: If the response body is not valid GBK.
    """
    print(url)
    # Flush right away so the progress line is not held back in a buffer.
    sys.stdout.flush()
    # Without an explicit timeout requests waits indefinitely on a stalled
    # server, which would hang the whole download.
    response = requests.get(url, timeout=timeout)
    # Fail loudly instead of silently parsing an error page into the book.
    response.raise_for_status()
    raw = response.content.decode('gbk').replace(r'<br/>', '\n')
    return clean(raw)
def main():
    """Fetch every page in ``URLs`` in parallel and write them to ``book.txt``.

    The pages are downloaded by a pool of ``WORKER`` processes, joined in
    the order of ``URLs``, and written UTF-8 encoded to ``book.txt`` in the
    current working directory (overwriting any existing file).
    """
    workers = Pool(WORKER)  # speed up by setting a larger number
    try:
        # map() blocks until every page is done and returns the results in
        # the same order as URLs.
        data = workers.map(from_url, URLs)
    finally:
        # Always shut the pool down so worker processes are not leaked,
        # including when a download raised.
        workers.terminate()
        workers.join()
    with open('book.txt', 'wb') as f:
        f.write(''.join(data).encode('utf-8'))
# Run the download only when executed as a script. The guard also matters for
# multiprocessing: child processes that re-import this module (spawn start
# method) must not start the download again.
if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment