import urllib2
from bs4 import BeautifulSoup
import sys

# Get the link for each chapter.
url = "http://www.23wx.com/html/50/50550/"  # 三界独尊 (Three Worlds Supreme)
content = urllib2.urlopen(url).read()
soup = BeautifulSoup(content, 'html.parser')
# The first 1000 <td> cells are skipped (site-specific offset to reach the chapter list).
links = soup.find_all('td')[1000:]
link = []
for i in links:
    try:
        # Chapter hrefs are relative to the book's index page.
        link.append(url + i.a['href'])
    except (AttributeError, TypeError, KeyError):
        # Skip <td> cells that contain no <a href=...> tag.
        pass
# Define a crawler that returns the text of one chapter page.
def crawler(url):
    content = urllib2.urlopen(url).read()  # fetch the page
    soup = BeautifulSoup(content, 'html.parser')
    txt = soup.find_all('dd')  # the chapter text lives in <dd> blocks
    page = ''
    for i in txt:
        page += i.get_text().encode('utf8') + '\n'
    return page
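
# Illustrative usage (commented out so the full scrape below still runs;
# link[0] is simply whichever chapter URL was collected first):
# print crawler(link[0])[:200]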
# Overwrite the previous line on stdout to show scraping progress.
def flushPrint(variable):
    sys.stdout.write('\r')
    sys.stdout.write('%s' % variable)
    sys.stdout.flush()
# Start to scrape: fetch each chapter in order and append it to one file.
for i in link:
    flushPrint(i)
    page = crawler(i)
    with open('/Users/chengjun/bigdata/three-worlds.txt', 'a') as f:
        f.write(page + '\n')
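
The script above targets Python 2 (urllib2, byte-string .encode('utf8')). For reference, a minimal sketch of the same fetch-and-parse step in Python 3, assuming the page structure (chapter text inside <dd> blocks) is unchanged; urllib.request replaces urllib2 and BeautifulSoup handles the decoding:

import urllib.request
from bs4 import BeautifulSoup

def crawler(url):
    # Fetch the raw bytes; BeautifulSoup detects the page encoding itself.
    raw = urllib.request.urlopen(url).read()
    soup = BeautifulSoup(raw, 'html.parser')
    # Join the text of every <dd> block, one block per line.
    return '\n'.join(dd.get_text() for dd in soup.find_all('dd'))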