skt041959/cltxt.py

## cltxt.py
#!/usr/bin/env python
# encoding: utf-8

import sys
import requests
from lxml import html
import re
import html2text

# Local HTTP proxy every request is routed through.
proxy = {"http": "http://127.0.0.1:8123"}
# Cookie sent with every request.
# NOTE(review): presumably selects the site's mobile layout, which the XPath
# expressions in main() are written against -- confirm.
cookie = {"ismob": "1"}

# Numeric thread id in a ".../<tid>.html" thread URL (the dot is literal).
THREAD_ID_RE = re.compile(r'/(\d+)\.html')
# "page=<n>" query parameter of a pagination link.
PAGE_PARAM_RE = re.compile(r'page=(\d+)')
# Characters that can never appear inside a file name component.
UNSAFE_NAME_CHARS_RE = re.compile(r'[/\x00]')

# URL template for pages 2..N of a thread: filled with (thread id, page).
PAGE_URL = 'http://t66y.com/read.php?tid={}&page={}'
# Seconds to wait on the server before giving up instead of hanging forever.
REQUEST_TIMEOUT = 30


def find_thread_id(url):
    """Return the numeric thread id embedded in *url*, or None if absent.

    The id is the run of digits in the first ``/<digits>.html`` occurrence.
    """
    match = THREAD_ID_RE.search(url)
    return match.group(1) if match else None


def last_page_number(hrefs):
    """Return the page number named by the last pagination link.

    *hrefs* is the list of pagination link targets found on the first page.
    Returns 1 when the list is empty or its last entry has no ``page=<n>``
    parameter, i.e. when the thread appears to consist of a single page.
    """
    if not hrefs:
        return 1
    match = PAGE_PARAM_RE.search(hrefs[-1])
    return int(match.group(1)) if match else 1


def output_filename(title, tid):
    """Build the ``<title>_<tid>.txt`` output name.

    Path separators and NUL bytes in *title* are replaced by ``_`` so that
    the final write cannot fail after every page has been downloaded; an
    empty title becomes ``untitled``.
    """
    name = UNSAFE_NAME_CHARS_RE.sub("_", title) or "untitled"
    return "{}_{}.txt".format(name, tid)


def fetch_page(url, encoding):
    """Download *url* through the proxy and decode the body with *encoding*."""
    response = requests.get(url, proxies=proxy, cookies=cookie,
                            timeout=REQUEST_TIMEOUT)
    return response.content.decode(encoding)


def main(argv=None):
    """Download a whole thread and save it as one plain-text file.

    Usage: ``cltxt.py THREAD_URL [LAST_PAGE]``

    argv: argument list without the program name; defaults to
        ``sys.argv[1:]``. ``argv[0]`` is the URL of the thread's first page,
        the optional ``argv[1]`` is the last page to fetch (inclusive). When
        it is omitted the number is read from the last pagination link of
        the first page.

    Exits with a message when the URL is missing or carries no thread id.
    """
    args = sys.argv[1:] if argv is None else list(argv)
    if not args:
        sys.exit("usage: cltxt.py THREAD_URL [LAST_PAGE]")

    url = args[0]
    tid = find_thread_id(url)
    if tid is None:
        sys.exit("no thread id (/<digits>.html) found in URL: {}".format(url))

    # NOTE(review): the first page is decoded as cp936 while the read.php
    # pages below are decoded as utf8, exactly as before -- confirm that both
    # encodings are really what the site serves.
    content = fetch_page(url, "cp936")
    document = html.fromstring(content)
    titles = document.xpath('/html/head/title/text()')
    hrefs = document.xpath('/html/body/div/div/table/tr/td/div/a/@href')
    if hrefs:
        print(hrefs[-1])

    if len(args) > 1:
        page_number = int(args[1])
    else:
        page_number = last_page_number(hrefs)

    converter = html2text.HTML2Text()
    converter.ignore_links = True

    parts = [converter.handle(content)]
    # page_number is itself a page that must be fetched, hence the "+ 1".
    for i in range(2, page_number + 1):
        page = fetch_page(PAGE_URL.format(tid, i), "utf8")
        parts.append(converter.handle(page))
        print(i)

    title = titles[0] if titles else ""
    # Explicit encoding: the text must not depend on the locale's default.
    with open(output_filename(title, tid), "w", encoding="utf-8") as f:
        f.write("".join(parts))


if __name__ == "__main__":
    main()
	#!/usr/bin/env python
	# encoding: utf-8

	import sys
	import requests
	from lxml import html
	import re
	import html2text

	# Local HTTP proxy every request is routed through.
	proxy = {"http": "http://127.0.0.1:8123"}
	# Cookie sent with every request.
	# NOTE(review): presumably selects the site's mobile layout, which the
	# XPath expressions in main() are written against -- confirm.
	cookie = {"ismob": "1"}

	# Numeric thread id in a ".../<tid>.html" thread URL (the dot is literal).
	THREAD_ID_RE = re.compile(r'/(\d+)\.html')
	# "page=<n>" query parameter of a pagination link.
	PAGE_PARAM_RE = re.compile(r'page=(\d+)')
	# Characters that can never appear inside a file name component.
	UNSAFE_NAME_CHARS_RE = re.compile(r'[/\x00]')

	# URL template for pages 2..N of a thread: filled with (thread id, page).
	PAGE_URL = 'http://t66y.com/read.php?tid={}&page={}'
	# Seconds to wait on the server before giving up instead of hanging forever.
	REQUEST_TIMEOUT = 30


	def find_thread_id(url):
	    """Return the numeric thread id embedded in *url*, or None if absent.

	    The id is the run of digits in the first ``/<digits>.html`` occurrence.
	    """
	    match = THREAD_ID_RE.search(url)
	    return match.group(1) if match else None


	def last_page_number(hrefs):
	    """Return the page number named by the last pagination link.

	    *hrefs* is the list of pagination link targets found on the first page.
	    Returns 1 when the list is empty or its last entry has no ``page=<n>``
	    parameter, i.e. when the thread appears to consist of a single page.
	    """
	    if not hrefs:
	        return 1
	    match = PAGE_PARAM_RE.search(hrefs[-1])
	    return int(match.group(1)) if match else 1


	def output_filename(title, tid):
	    """Build the ``<title>_<tid>.txt`` output name.

	    Path separators and NUL bytes in *title* are replaced by ``_`` so that
	    the final write cannot fail after every page has been downloaded; an
	    empty title becomes ``untitled``.
	    """
	    name = UNSAFE_NAME_CHARS_RE.sub("_", title) or "untitled"
	    return "{}_{}.txt".format(name, tid)


	def fetch_page(url, encoding):
	    """Download *url* through the proxy and decode the body with *encoding*."""
	    response = requests.get(url, proxies=proxy, cookies=cookie,
	                            timeout=REQUEST_TIMEOUT)
	    return response.content.decode(encoding)


	def main(argv=None):
	    """Download a whole thread and save it as one plain-text file.

	    Usage: ``cltxt.py THREAD_URL [LAST_PAGE]``

	    argv: argument list without the program name; defaults to
	        ``sys.argv[1:]``. ``argv[0]`` is the URL of the thread's first page,
	        the optional ``argv[1]`` is the last page to fetch (inclusive). When
	        it is omitted the number is read from the last pagination link of
	        the first page.

	    Exits with a message when the URL is missing or carries no thread id.
	    """
	    args = sys.argv[1:] if argv is None else list(argv)
	    if not args:
	        sys.exit("usage: cltxt.py THREAD_URL [LAST_PAGE]")

	    url = args[0]
	    tid = find_thread_id(url)
	    if tid is None:
	        sys.exit("no thread id (/<digits>.html) found in URL: {}".format(url))

	    # NOTE(review): the first page is decoded as cp936 while the read.php
	    # pages below are decoded as utf8, exactly as before -- confirm that
	    # both encodings are really what the site serves.
	    content = fetch_page(url, "cp936")
	    document = html.fromstring(content)
	    titles = document.xpath('/html/head/title/text()')
	    hrefs = document.xpath('/html/body/div/div/table/tr/td/div/a/@href')
	    if hrefs:
	        print(hrefs[-1])

	    if len(args) > 1:
	        page_number = int(args[1])
	    else:
	        page_number = last_page_number(hrefs)

	    converter = html2text.HTML2Text()
	    converter.ignore_links = True

	    parts = [converter.handle(content)]
	    # page_number is itself a page that must be fetched, hence the "+ 1".
	    for i in range(2, page_number + 1):
	        page = fetch_page(PAGE_URL.format(tid, i), "utf8")
	        parts.append(converter.handle(page))
	        print(i)

	    title = titles[0] if titles else ""
	    # Explicit encoding: the text must not depend on the locale's default.
	    with open(output_filename(title, tid), "w", encoding="utf-8") as f:
	        f.write("".join(parts))


	if __name__ == "__main__":
	    main()