# Created June 5, 2014 03:05
# My web crawler
# _*_ coding: utf-8 _*_
import sys
from gevent import monkey
import requests
import redis
import gevent
from gevent.pool import Pool
from bs4 import BeautifulSoup
import time
from pymongo import MongoClient, ReadPreference
import json
# Point the socket module used by redis.connection at gevent's socket module.
import redis.connection
redis.connection.socket = gevent.socket
# MongoDB client and a handle on its "jobdigg" database.
# NOTE(review): the host/port arguments of the '%s:%d' format are missing from
# this copy of the file, so the call below is incomplete -- restore them
# before running.
mongo_connection = MongoClient(
'%s:%d' % (
max_pool_size=10, use_greenlets=True)
mongo_db = mongo_connection.jobdigg
# Redis connection pools; redis_connection is the pool used by
# WYUrlGenerator, ErrorUrlGenerator and GenerateUrl.
# NOTE(review): the arguments and closing parentheses of both ConnectionPool
# calls are missing from this copy of the file -- restore them before running.
redis_connection = redis.ConnectionPool(
redis_proxy_pool = redis.ConnectionPool(
# NOTE(review): GenerateUrl calls proxy_pool.getProxy(), which a plain list
# does not provide; presumably this is replaced by a proxy-pool object
# elsewhere -- confirm.
proxy_pool = []
# Size passed to every gevent Pool created in WYUrlGenerator.
pool_num = 100
# HTTP request headers sent with every page fetch (passed as ``headers=`` to
# requests.get in GenerateUrl).
# NOTE(review): "sdch" is advertised in Accept-Encoding; confirm the HTTP
# client can decode such a response, or drop it from the list.
header = {
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
    "Accept-Encoding": "gzip,deflate,sdch",
    "Accept-Language": "zh-CN,zh;q=0.8",
    "Cache-Control": "max-age=0",
    "Connection": "keep-alive",
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.114 Safari/537.36",
}
def WYUrlGenerator():
    """Crawl every seed page, then retry failed jobs until none are left.

    Each job yielded by WYJobUrlYield is handed to GenerateUrl in a gevent
    Pool of ``pool_num`` greenlets.  Afterwards the members of the
    "error_url_list" redis set are re-submitted, round after round, until
    the set is empty.  The elapsed wall-clock time is printed at the end.

    Each pool is joined before the error set is inspected, so every check
    (and the final timing) sees the result of all fetches spawned so far
    rather than racing against greenlets that are still running.

    NOTE: the retry loop only ends once "error_url_list" is empty; a job
    that keeps failing keeps the loop running.
    """
    print('51 Dig start : the url...')
    start = time.time()
    redis_db = redis.Redis(connection_pool=redis_connection)

    gpool = Pool(pool_num)
    for uargs in WYJobUrlYield():
        gpool.spawn(GenerateUrl, uargs)
    # Wait for the in-flight fetches before looking at the error set.
    gpool.join()

    # From here on, keep pulling jobs out of the error set until it is empty.
    while redis_db.scard("error_url_list") > 0:
        epool = Pool(pool_num)
        for url in ErrorUrlGenerator():
            epool.spawn(GenerateUrl, url)
        # Finish this retry round before re-counting, otherwise the same
        # members would be spawned again while still being processed.
        epool.join()

    end = time.time()
    print('dig end : the url...all spend time is %0.2f' % (end - start))
def WYJobUrlYield(page_count=3000):
    """Yield one JSON-encoded seed job per listing page.

    Args:
        page_count: number of pages to generate; pages are numbered from 1
            to ``page_count`` inclusive.  Defaults to 3000.

    Yields:
        A JSON string of a dict with the keys "url" (the page URL) and
        "type" (always "jobtype").
    """
    # range (rather than xrange) keeps this runnable on Python 2 and 3.
    for page in range(1, page_count + 1):
        jobitem = {
            "url": "http://some.crawl.url with page num %s" % page,
            "type": "jobtype",
        }
        yield json.dumps(jobitem)
# Take the jobs out of the error-URL set so they can be processed again.
def ErrorUrlGenerator():
    """Yield every member currently stored in the "error_url_list" redis set.

    The members are read with a single ``smembers`` call when iteration
    starts and are then yielded one at a time.
    """
    client = redis.Redis(connection_pool=redis_connection)
    failed_jobs = client.smembers("error_url_list")
    for job in failed_jobs:
        yield job
def GenerateUrl(sourcejob):
    """Fetch one queued job through a proxy and store the links it contains.

    Args:
        sourcejob: JSON string of a dict with the keys "url" and "type".

    Side effects:
        * For jobs whose type is "urltype", the href of every
          ``<a class="classname">`` element is added to the "url_list"
          redis set as a JSON string ``{"url": ..., "type": "urltype"}``.
        * After a successful fetch the job is removed from
          "error_url_list" (a no-op if it was not a retried job).
        * On a requests ConnectionError (ProxyError is a subclass), or when
          the 5 second timeout fires, the job is added to "error_url_list"
          so that it is retried later.
        * Any other exception is reported on stdout and the job is dropped.
    """
    redis_db = redis.StrictRedis(connection_pool=redis_connection)
    newitem = json.loads(sourcejob)
    url = newitem["url"]
    urltype = newitem["type"]
    # Matching on the class (instead of on the class *name*) keeps unrelated
    # exceptions that merely share the name "ConnectionError" out of the
    # retry path.
    retryable = requests.exceptions.ConnectionError
    try:
        # NOTE(review): proxy_pool is initialised to a plain list at module
        # level, which has no getProxy(); confirm it is replaced before use.
        ip = proxy_pool.getProxy()
        proxy = {"http": "http://" + ip["proxy"]}
        # Limit each request to 5 seconds so that one slow page cannot hold
        # up the requests queued behind it.  The ``with`` block cancels the
        # timer as soon as the request returns, so it can no longer fire
        # later on, e.g. while the except handler is talking to redis.
        with gevent.Timeout(5, retryable):
            r = requests.get(url, headers=header, proxies=proxy)
        jobs = BeautifulSoup(r.text)

        pipe = redis_db.pipeline()
        # NOTE(review): the seed jobs built by WYJobUrlYield carry the type
        # "jobtype", so this branch is skipped for them -- confirm intended.
        if urltype == "urltype":
            # Collect every matching link on the page into a redis set.
            for result in jobs.findAll("a", {"class": "classname"}):
                href = result.get("href")
                if not href:
                    # An anchor without href has nothing to queue.
                    continue
                urlvalue = json.dumps({"url": href, "type": "urltype"})
                # Store the JSON string (not the dict) so that consumers
                # can json.loads() it again.
                pipe.sadd("url_list", urlvalue)
        # Reaching this point means the job was handled; if it came from the
        # error set, remove it from there.
        pipe.srem("error_url_list", sourcejob)
        # Queued pipeline commands are only sent to redis on execute().
        pipe.execute()
    except Exception as e:
        if isinstance(e, retryable):
            # Connection/proxy failures and timeouts go back into the
            # error set and are picked up again in a later round.
            redis_db.sadd('error_url_list', sourcejob)
        else:
            # Best effort: report the failure and drop the job rather than
            # swallowing it silently.
            print('GenerateUrl: %s while handling %s' % (e.__class__.__name__, url))
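# Observed failure: the gevent Timeout timer threw ConnectionError into the
# greenlet while GenerateUrl was waiting on redis in
# redis_db.sadd('error_url_list', sourcejob):
# ConnectionError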
# <timer at 0x36c8c80 callback=<bound method Greenlet.throw of <Greenlet at 0xc844050>> args=(<class 'requests.exceptions.ConnectionError'>,)> failed with ConnectionError
# Traceback (most recent call last):
# File "/data/home/zp/ZP_VIRENV02/lib/python2.7/site-packages/gevent/", line 327, in run
# result = self._run(*self.args, **self.kwargs)
# File "", line 147, in GenerateUrl
# redis_db.sadd('error_url_list', sourcejob)
# File "/data/home/zp/ZP_VIRENV02/lib/python2.7/site-packages/redis/", line 1248, in sadd
# return self.execute_command('SADD', name, *values)
# File "/data/home/zp/ZP_VIRENV02/lib/python2.7/site-packages/redis/", line 461, in execute_command
# return self.parse_response(connection, command_name, **options)
# File "/data/home/zp/ZP_VIRENV02/lib/python2.7/site-packages/redis/", line 471, in parse_response
# response = connection.read_response()
# File "/data/home/zp/ZP_VIRENV02/lib/python2.7/site-packages/redis/", line 339, in read_response
# response = self._parser.read_response()
# File "/data/home/zp/ZP_VIRENV02/lib/python2.7/site-packages/redis/", line 110, in read_response
# response =
# File "/data/home/zp/ZP_VIRENV02/lib/python2.7/site-packages/redis/", line 103, in read
# return self._fp.readline()[:-2]
# File "/usr/local/lib/python2.7/", line 447, in readline
# data = self._sock.recv(self._rbufsize)
# File "/data/home/zp/ZP_VIRENV02/lib/python2.7/site-packages/gevent/", line 392, in recv
# self._wait(self._read_event)
# File "/data/home/zp/ZP_VIRENV02/lib/python2.7/site-packages/gevent/", line 298, in _wait
# self.hub.wait(watcher)
# File "/data/home/zp/ZP_VIRENV02/lib/python2.7/site-packages/gevent/", line 341, in wait
# result = waiter.get()
# File "/data/home/zp/ZP_VIRENV02/lib/python2.7/site-packages/gevent/", line 568, in get
# return self.hub.switch()
# File "/data/home/zp/ZP_VIRENV02/lib/python2.7/site-packages/gevent/", line 331, in switch
# return greenlet.switch(self)
if __name__ == '__main__':
    # Time the whole crawl.  Without a call between the two timestamps the
    # script did no work and always reported an elapsed time of ~0.00s.
    # NOTE(review): `monkey` is imported at the top of the file but no
    # patch call is visible here -- confirm whether monkey patching is
    # expected before the crawl starts.
    st = time.time()
    WYUrlGenerator()
    et = time.time()
    print("**************end****************,the spend time is %0.2f" % (et - st))
