Skip to content

Instantly share code, notes, and snippets.

@skt041959
Created December 1, 2015 15:55
Show Gist options
  • Save skt041959/011c6fc5222d43430066 to your computer and use it in GitHub Desktop.
Save skt041959/011c6fc5222d43430066 to your computer and use it in GitHub Desktop.
Download a novel thread from 1024 (t66y.com) and save it as a plain-text file.
#!/usr/bin/env python
# encoding: utf-8
import sys
import requests
from lxml import html
import re
import html2text
# Settings passed to every HTTP request made by this script.
proxy = {"http": "http://127.0.0.1:8123"}
# NOTE(review): presumably selects the site's mobile layout -- confirm.
cookie = {"ismob": "1"}

# URL template used for every page after the first one.
PAGE_URL = 'http://t66y.com/read.php?tid={}&page={}'


def thread_id_from_url(url):
    """Return the thread id found in *url* as a string of digits.

    The id is the run of digits directly before ``.html`` in the last
    matching path segment, e.g. ``.../1511/1234567.html`` -> ``"1234567"``.

    Raises:
        ValueError: if *url* contains no ``/<digits>.html`` segment.
    """
    match = re.search(r'/(\d+)\.html', url)
    if match is None:
        raise ValueError(
            "no thread id (/<digits>.html) found in {!r}".format(url))
    return match.group(1)


def last_page_from_href(href):
    """Return the integer N of the first ``page=N`` in *href*, or 1 if absent."""
    match = re.search(r'page=(\d+)', href)
    return int(match.group(1)) if match else 1


def output_filename(title, tid):
    """Build the output file name ``<title>_<tid>.txt``.

    ``/`` in *title* is replaced with ``_`` so the title cannot be
    interpreted as a directory path.
    """
    return "{}_{}.txt".format(title.replace("/", "_"), tid)


def main():
    """Download every page of a thread and write it out as one text file.

    Command line: ``script THREAD_URL [PAGE_COUNT]``.  When PAGE_COUNT is
    omitted it is taken from the last pagination link on the first page;
    a thread without pagination links is treated as a single page.
    """
    if len(sys.argv) < 2:
        sys.exit("usage: {} THREAD_URL [PAGE_COUNT]".format(sys.argv[0]))
    url = sys.argv[1]
    tid = thread_id_from_url(url)

    ht = html2text.HTML2Text()
    ht.ignore_links = True

    # NOTE(review): the first page is decoded as cp936 while the read.php
    # pages below are decoded as utf8; kept exactly as in the original
    # script -- confirm against what the site actually serves.
    r = requests.get(url, proxies=proxy, cookies=cookie)
    content = r.content.decode("cp936")
    h = html.fromstring(content)
    title = h.xpath('/html/head/title/text()')
    urls = h.xpath('/html/body/div/div/table/tr/td/div/a/@href')
    if urls:
        print(urls[-1])

    # An explicit page count on the command line wins over the value
    # scraped from the page.
    if len(sys.argv) > 2:
        page_number = int(sys.argv[2])
    elif urls:
        page_number = last_page_from_href(urls[-1])
    else:
        page_number = 1

    # Collect the converted pages and join once at the end instead of
    # repeatedly re-copying a growing string.
    parts = [ht.handle(content)]
    # Page 1 is already in hand; the upper bound is inclusive so that the
    # final page is fetched too.
    for i in range(2, page_number + 1):
        page_url = PAGE_URL.format(tid, i)
        r = requests.get(page_url, proxies=proxy, cookies=cookie)
        parts.append(ht.handle(r.content.decode("utf8")))
        print(i)

    name = title[0] if title else "untitled"
    # Explicit encoding: the text is non-ASCII and must not depend on the
    # locale's default encoding.
    with open(output_filename(name, tid), "w", encoding="utf-8") as f:
        f.write("".join(parts))


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment