Skip to content

Instantly share code, notes, and snippets.

@qzane
Last active February 21, 2017 10:26
Show Gist options
  • Save qzane/d9b0772f2fecda112b395f04d8057b23 to your computer and use it in GitHub Desktop.
Save qzane/d9b0772f2fecda112b395f04d8057b23 to your computer and use it in GitHub Desktop.
download ebooks from www.bookbao8.com
#coding: utf-8
''' download ebooks from http://www.bookbao8.com/ '''
import requests
from lxml import etree
import sys
from multiprocessing import Pool
# Template for one page of the book; {} is filled with the 1-based page number.
URL = r'http://www.bookbao8.com/views/200910/06/id_XMjg4NDg=_{}.html'
# Every page address, in reading order: pages 1 through 23 inclusive.
URLs = list(map(URL.format, range(1, 24)))
# Size of the process pool used for downloading; a larger number speeds things up.
WORKER = 5
def clean(text):
    ''' Pull the chapter text out of one page of HTML.

    Parses `text` as HTML, collects the text nodes that sit directly under
    the <dd id="contents"> element, concatenates them, and converts every
    carriage return into a newline.  Returns the resulting string.
    '''
    root = etree.HTML(text)
    pieces = root.xpath(r'//dd[@id="contents"]/text()')
    return ''.join(pieces).replace('\r','\n')
def from_url(url, timeout=30):
    ''' Download one page and return its cleaned chapter text.

    url     -- address of the page to fetch.
    timeout -- seconds passed to requests.get() as its timeout, so that a
               stalled connection cannot block a pool worker forever.

    Returns the text extracted by clean().

    Raises a requests exception when the request times out, fails, or comes
    back with an HTTP error status, and UnicodeDecodeError when the body is
    not valid GBK.
    '''
    print(url)
    # This runs inside pool workers; flush so progress shows up right away.
    sys.stdout.flush()
    response = requests.get(url, timeout=timeout)
    # Stop on 4xx/5xx instead of parsing an error page as if it were a chapter.
    response.raise_for_status()
    # Pages are decoded as GBK.  <br/> tags become newlines before parsing
    # because clean() keeps only text nodes, which would drop the line breaks.
    raw = response.content.decode('gbk').replace(r'<br/>', '\n')
    return clean(raw)
def main():
    ''' Download every page in URLs in parallel and save the text to book.txt.

    Pages are fetched by a pool of WORKER processes.  The combined text is
    written UTF-8 encoded to 'book.txt' in the current working directory,
    replacing any existing file.  An exception raised while fetching a page
    propagates to the caller and nothing is written.
    '''
    workers = Pool(WORKER) # speed up by setting a larger number
    try:
        # map() returns results in the same order as URLs, so the pages stay
        # in reading order even though they are downloaded concurrently.
        data = workers.map(from_url, URLs)
    finally:
        # Always shut the pool down, even when a download fails, so no worker
        # processes are left behind.  map() has already returned or raised,
        # so there is no pending result to lose by terminating.
        workers.terminate()
        workers.join()
    with open('book.txt','wb') as f:
        f.write(''.join(data).encode('utf-8'))
# Start the download only when this file is run as a script.  The guard also
# matters for multiprocessing: on platforms where workers are started by
# re-importing the main module, it keeps them from calling main() again.
if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment