import urllib2
from bs4 import BeautifulSoup
import sys

# Get the link for each chapter.
url = "http://www.23wx.com/html/50/50550/"  # 三界独尊 (Three Worlds Supreme)
content = urllib2.urlopen(url).read()
soup = BeautifulSoup(content, 'html.parser')
# The first 1000 <td> cells are skipped (site-specific offset to reach the chapter list).
links = soup.find_all('td')[1000:]
link = []
for i in links:
    try:
        # Chapter hrefs are relative to the book's index page.
        link.append(url + i.a['href'])
    except (AttributeError, TypeError, KeyError):
        # Skip <td> cells that contain no <a href=...> tag.
        pass
# Define a crawler that returns the text of one chapter page.
def crawler(url):
    content = urllib2.urlopen(url).read()  # fetch the page
    soup = BeautifulSoup(content, 'html.parser')
    txt = soup.find_all('dd')  # the chapter text lives in <dd> blocks
    page = ''
    for i in txt:
        page += i.get_text().encode('utf8') + '\n'
    return page
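
# Illustrative usage (commented out so the full scrape below still runs;
# link[0] is simply whichever chapter URL was collected first):
# print crawler(link[0])[:200]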
# Overwrite the previous line on stdout to show scraping progress.
def flushPrint(variable):
    sys.stdout.write('\r')
    sys.stdout.write('%s' % variable)
    sys.stdout.flush()
# Start to scrape: fetch each chapter in order and append it to one file.
for i in link:
    flushPrint(i)
    page = crawler(i)
    with open('/Users/chengjun/bigdata/three-worlds.txt', 'a') as f:
        f.write(page + '\n')
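
The script above targets Python 2 (urllib2, byte-string .encode('utf8')). For reference, a minimal sketch of the same fetch-and-parse step in Python 3, assuming the page structure (chapter text inside <dd> blocks) is unchanged; urllib.request replaces urllib2 and BeautifulSoup handles the decoding:

import urllib.request
from bs4 import BeautifulSoup

def crawler(url):
    # Fetch the raw bytes; BeautifulSoup detects the page encoding itself.
    raw = urllib.request.urlopen(url).read()
    soup = BeautifulSoup(raw, 'html.parser')
    # Join the text of every <dd> block, one block per line.
    return '\n'.join(dd.get_text() for dd in soup.find_all('dd'))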