#!/usr/bin/env python
# coding: utf-8
# auto_proxy.py -- free-proxy pool scraped from xicidaili.com
# (the crawler below imports this module via `from auto_proxy import Proxy`)
from bs4 import BeautifulSoup
import requests
import re
import time
from multiprocessing.dummy import Pool as ThreadPool


class Proxy(object):
    def __init__(self, max_page=1):
        self.timestamp = time.time()
        self.max_page = max_page
        self.proxies = []
        self.checked_proxies = []
        self.s = requests.Session()
        self.headers = {
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Encoding': 'gzip, deflate, sdch',
            'Accept-Language': 'zh-CN,zh;q=0.8,en;q=0.6,it;q=0.4,zh-TW;q=0.2',
            'Connection': 'keep-alive',
            'Host': 'www.xicidaili.com',
            'Upgrade-Insecure-Requests': '1',
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36'
        }
        self.s.headers.update(self.headers)
        self.url = 'http://www.xicidaili.com/nn/'
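
    # How the pool works (summary of the methods below):
    #   * _parse_proxy()  scrapes the xicidaili.com/nn/ table and keeps rows whose
    #     reported response time is at most 4 seconds, storing requests-style
    #     dicts such as {'https': 'ip:port'} in self.proxies.
    #   * _check_proxy()  verifies a candidate by fetching http://httpbin.org/ip
    #     through it; with anonymous=True it also requires that the origin IP
    #     reported by httpbin equals the proxy's own IP (high-anonymity check).
    #   * get_proxy()     rebuilds the pool with 8 checker threads whenever it is
    #     empty or older than 30 minutes, then returns self.checked_proxies.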

    def _parse_proxy(self):
        res = self.s.get(self.url)
        try:
            soup = BeautifulSoup(res.content, 'html.parser')
            tr_tags = soup.find('table', {'id': 'ip_list'}).find_all('tr')
            for tr in tr_tags:
                tds = tr.find_all('td')
                if not tds:  # skip the header row
                    continue
                time_tag = tds[6].find('div', {'class': 'bar'}, title=True)
                if time_tag:
                    conn_time = re.search(r'\d+\.\d+', str(time_tag)).group()
                    if float(conn_time) > 4:  # drop proxies slower than 4 seconds
                        continue
                ip = tds[1].text
                port = tds[2].text
                schema = tds[5].text
                if schema == 'HTTPS':
                    self.proxies.append({'https': ip + ':' + port})
                else:
                    self.proxies.append({'http': ip + ':' + port})
        except Exception as e:
            print(u'Failed to fetch proxies: {}'.format(e))

    def _check_proxy(self, proxy, anonymous=True):
        try:
            r = requests.get('http://httpbin.org/ip', proxies=proxy, timeout=3)
            data = r.json()
            if anonymous:
                # high-anonymity check: the origin IP reported by httpbin
                # must be the proxy's own IP
                if data['origin'] == proxy.values()[0].split(':')[0]:
                    self.checked_proxies.append(proxy)
            else:
                self.checked_proxies.append(proxy)
        except Exception:
            pass

    def get_proxy(self):
        time_distance = time.time() - self.timestamp
        if len(self.checked_proxies) == 0 or time_distance > 60 * 30:
            print(u'Refreshing the proxy pool...')
            self.proxies = []
            self.checked_proxies = []
            self._parse_proxy()
            pool = ThreadPool(8)
            pool.map(self._check_proxy, self.proxies)
            pool.close()
            pool.join()
            self.timestamp = time.time()
            print(u'New proxy pool: \n{}\n'.format(self.checked_proxies))
        return self.checked_proxies


if __name__ == '__main__':
    ins = Proxy()
    print ins.get_proxy()
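
# Usage sketch (hypothetical consumer, not part of the original gist): every
# entry returned by get_proxy() is already a requests-style proxies dict such
# as {'https': '1.2.3.4:8080'}, so it can be passed straight to requests, e.g.:
#
#     import random
#     import requests
#     pool = Proxy(max_page=1).get_proxy()
#     if pool:
#         r = requests.get('http://httpbin.org/ip',
#                          proxies=random.choice(pool), timeout=5)
#         print r.json()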


#!/usr/bin/env python
# coding: utf-8
# Lianjia crawler (second file of the gist); relies on the Proxy class above.
import threading
import sqlite3
import random
import urlparse
import re
import time
from datetime import datetime
from requests import Session
from requests.exceptions import ProxyError, ConnectTimeout
from bs4 import BeautifulSoup
from multiprocessing.dummy import Pool
from auto_proxy import Proxy


class SQLiteHelper:
    def __init__(self, db_path):
        self.db_path = db_path

    def conn_transaction(func):
        def connection(self, *args, **kwargs):
            conn = sqlite3.connect(self.db_path)
            kwargs['conn'] = conn
            result = func(self, *args, **kwargs)
            conn.close()
            return result
        return connection
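
    # conn_transaction is used as a plain function decorator inside the class
    # body: it opens a fresh sqlite3 connection per call, injects it into the
    # wrapped method via kwargs['conn'], and closes it when the call returns.
    # Committing and rolling back are left to the wrapped methods below.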

    @conn_transaction
    def execute(self, command, params=None, conn=None):
        cursor = conn.cursor()
        result = 0
        try:
            if params:
                cursor.execute(command, params)
            else:
                cursor.execute(command)
            conn.commit()
        except Exception as e:
            print e
            conn.rollback()
            result = -1
        finally:
            cursor.close()
        return result

    @conn_transaction
    def fetch_data(self, command, conn=None):
        cursor = conn.cursor()
        result = []
        try:
            cursor.execute(command)
            result = cursor.fetchall()
        except Exception as e:
            print e
        finally:
            cursor.close()
        return result

    def format_insert_params(self, column_name_list, params):
        result = []
        for col in column_name_list:
            if col in params:
                result.append(params[col])
            else:
                result.append('')
        return tuple(result)
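

# Usage sketch (illustration only, mirroring how the crawlers below use the
# helper): format_insert_params() turns an info dict into an ordered value
# tuple, with '' for every missing column, which is then bound to the
# placeholders passed to execute():
#
#     db = SQLiteHelper('bj.db')
#     info = {'name': u'Chaoyang', 'url': 'http://bj.lianjia.com/xiaoqu/chaoyang/'}
#     params = db.format_insert_params(['name', 'url', 'status', 'city', 'timestamp'], info)
#     # params == (u'Chaoyang', 'http://bj.lianjia.com/xiaoqu/chaoyang/', '', '', '')
#     db.execute('insert into chengqu values(?, ?, ?, ?, ?)', params)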


HEADER_UA_LIST = [
    {'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'},
    {'User-Agent': 'Mozilla/5.0 (Windows NT 6.2) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.12 Safari/535.11'},
    {'User-Agent': 'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.2; Trident/6.0)'},
    {'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:34.0) Gecko/20100101 Firefox/34.0'},
    {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/44.0.2403.89 Chrome/44.0.2403.89 Safari/537.36'},
    {'User-Agent': 'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50'},
    {'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50'},
    {'User-Agent': 'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)'},
    {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1'},
    {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1'},
    {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11'},
    {'User-Agent': 'Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; en) Presto/2.8.131 Version/11.11'},
    {'User-Agent': 'Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11'},
]


class Crawler:
    def __init__(self):
        self.proxies_list = []

    def crawl(self, url):
        proxy = None
        if len(self.proxies_list) > 0:
            proxy = self.proxies_list[random.randint(0, len(self.proxies_list) - 1)]
            print(u'Using a proxy ({})'.format(self.timestamp))
        try:
            res = Session().get(url,
                                headers=HEADER_UA_LIST[random.randint(0, len(HEADER_UA_LIST) - 1)],
                                proxies=proxy, timeout=15)
            soup = BeautifulSoup(res.content, 'html.parser')
            soup_content = soup.encode_contents(encoding='utf-8')
            if soup_content.find(u'您所在的IP流量异常'.encode('utf-8')) != -1:
                # Lianjia's "abnormal traffic from your IP" ban page
                return -2
        except (ProxyError, ConnectTimeout):
            # drop the bad proxy and retry the same URL
            if proxy in self.proxies_list:
                self.proxies_list.remove(proxy)
            return self.crawl(url)
        except Exception as e:
            print 'Crawl {} error: {}'.format(url, e)
            return -1
        else:
            return self.extract_func(soup, res)

    def start_thread_pool(self, func, args, nums=8):
        thread_pool = Pool(processes=nums)
        thread_pool.map(func, args)
        thread_pool.close()
        thread_pool.join()
        del thread_pool

    def extract_func(self, soup, res):
        pass  # subclasses must implement this

    @property
    def timestamp(self):
        return datetime.now().strftime('%Y-%m-%d %H:%M:%S')
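

# Crawler is a small template-method base class: crawl() fetches a URL with a
# random User-Agent (and, when proxies_list is non-empty, a random proxy),
# returns -2 when Lianjia's "abnormal IP traffic" ban page is detected, retries
# through another proxy on proxy/timeout errors, returns -1 on other failures,
# and otherwise hands the parsed page to extract_func(), which each subclass
# (ChengquCrawler, XiaoquCrawler, ChengjiaoCrawler, ZufangCrawler) overrides.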


class ChengquCrawler(Crawler):
    def __init__(self, database, base_url='http://bj.lianjia.com', path='/xiaoqu/'):
        Crawler.__init__(self)
        self.base_url = base_url
        self.path = path
        self.database = database

    def crawl(self):
        result = self.fetchall()
        if len(result) == 0:
            return Crawler.crawl(self, self.base_url + self.path)
        else:
            return 0

    def extract_func(self, soup, res):
        try:
            tag = soup.find('div', {'class': 'position'}).find('a', {'href': '/xiaoqu/'})
            city = tag['title'].replace(u'小区区域', '')
        except Exception as e:
            print 'Failed to parse the city: {}'.format(e)
            return -1
        else:
            try:
                tag_list = soup.find('div', {'data-role': 'ershoufang'}).find_all('a', href=True)
                info_dict = {}
                for t in tag_list:
                    try:
                        info_dict['url'] = self.base_url + t['href']
                        info_dict['name'] = t.text
                        info_dict['city'] = city
                        info_dict['status'] = 'n'
                        info_dict['timestamp'] = self.timestamp
                    except Exception as e:
                        print u'{} failed to parse this district, e: {}'.format(str(info_dict), e)
                        continue
                    else:
                        params = self.database.format_insert_params(
                            ['name', 'url', 'status', 'city', 'timestamp'], info_dict)
                        self.database.execute('insert into chengqu values(?, ?, ?, ?, ?)', params)
            except Exception as e:
                print e
                return -1
        return 0

    def fetchall(self):
        return self.database.fetch_data('select * from chengqu')

    def update_chengqu_crawled(self, url):
        return self.database.execute('update chengqu set status=?, timestamp=? where url=?',
                                     ('y', self.timestamp, url))


class XiaoquCrawler(Crawler):
    def __init__(self, database):
        Crawler.__init__(self)
        self.database = database
        self.failedpage_list = []
        self.did_start_thread_pool = False

    def crawl(self, url):
        code = Crawler.crawl(self, url)
        if code == -2 or code == -1:
            self.failedpage_list.append(url)
        return code

    def extract_func(self, soup, res):
        try:
            if not self.did_start_thread_pool:
                # First page: read the pagination info and fan the page URLs
                # out to a thread pool.
                self.did_start_thread_pool = True
                div = soup.find('div', {'class': 'page-box house-lst-page-box'})
                page_dic = 'page_dic=' + div['page-data']
                exec(page_dic)  # evaluates the page-data attribute into a dict
                total_pages = page_dic['totalPage']
                url_need_handle = urlparse.urlparse(res.request.url)[0] + '://' + urlparse.urlparse(res.request.url)[1]
                url_need_handle = url_need_handle + div['page-url']
                page_urls = []
                for i in range(1, total_pages + 1):
                    page_urls.append(url_need_handle.replace('{page}', str(i)))
                if len(page_urls) > 0:
                    self.start_thread_pool(self.crawl, page_urls)
            else:
                self.extract_xiaoqu(soup)
        except Exception as e:
            print 'Failed to parse xiaoqu page count, e: {}'.format(e)
        finally:
            return 0
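
    # Pagination note: on Lianjia list pages the pager div carries two
    # attributes, page-data (a dict-like string, apparently of the form
    # {"totalPage": N, "curPage": 1}) and page-url (a path template containing
    # the literal placeholder {page}). The code above exec's the page-data
    # string to recover totalPage, rebuilds scheme://host from the request URL
    # with urlparse, and substitutes {page} to enumerate every list page.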

    def extract_xiaoqu(self, soup):
        xiaoqu_list = soup.find_all('li', {'class': 'clear'})
        for xiaoqu in xiaoqu_list:
            info_dict = {}
            try:
                info_dict['name'] = xiaoqu.find('div', {'class': 'title'}).find('a').text
                info_dict['url'] = xiaoqu.find('div', {'class': 'title'}).find('a')['href']
                try:
                    info_dict['img'] = xiaoqu.find('img', {'class': 'lj-lazy'})['src']
                    info_dict['district'] = xiaoqu.find('div', {'class': 'positionInfo'}).find('a', {'class': 'district'}).text
                    info_dict['bizcircle'] = xiaoqu.find('div', {'class': 'positionInfo'}).find('a', {'class': 'bizcircle'}).text
                    re_string = xiaoqu.find('div', {'class': 'positionInfo'}).renderContents().strip().decode('utf-8')
                    re_match = re.split('</a>', re_string)
                    if len(re_match) > 1:
                        type_time = re_match[-1].strip().lstrip('/').split('/')
                        if len(type_time) >= 2:
                            xiaoqu_type = '/'.join(type_time[0:-1])
                            xiaoqu_time = type_time[-1].strip()
                            xiaoqu_time = re.search(r'\d*', xiaoqu_time).group()
                            info_dict['type'] = xiaoqu_type
                            info_dict['time'] = xiaoqu_time
                    house_info = xiaoqu.find('div', {'class': 'houseInfo'})
                    for a_tag in house_info.find_all('a', href=True):
                        href = a_tag['href']
                        if re.match('.*/chengjiao/.*', href):
                            info_dict['cj_url'] = href
                        elif re.match('.*/zufang/.*', href):
                            info_dict['zf_url'] = href
                except Exception as e:
                    print u'{} failed to extract some field, e: {}'.format(info_dict['name'], e)
                info_dict['cj_status'] = 'n'
                info_dict['zf_status'] = 'n'
                info_dict['timestamp'] = self.timestamp
                params = self.database.format_insert_params(
                    ['url', 'img', 'name', 'district', 'bizcircle', 'type', 'time',
                     'cj_url', 'zf_url', 'cj_status', 'zf_status', 'timestamp'], info_dict)
                self.database.execute('insert into xiaoqu values(?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)', params)
            except Exception as e:
                print(u'Failed to extract a xiaoqu; e: {}\n {}'.format(e, str(xiaoqu).decode('utf-8')))
                continue

    def fetchall(self):
        '''
        (url, cj_url, cj_status, zf_url, zf_status, name)
        '''
        return self.database.fetch_data('select url, cj_url, cj_status, zf_url, zf_status, name from xiaoqu')

    def fetchall_uncrawled_cj(self):
        '''
        (url, cj_url, cj_status)
        '''
        return self.database.fetch_data('select url, cj_url, cj_status from xiaoqu where cj_status="n"')

    def update_chengjiao_crawled(self, url):
        return self.database.execute('update xiaoqu set cj_status=?, timestamp=? where url=?',
                                     ('y', self.timestamp, url))

    def fetchall_uncrawled_zf(self):
        '''
        (url, zf_url, zf_status)
        '''
        return self.database.fetch_data('select url, zf_url, zf_status from xiaoqu where zf_status="n" and zf_url != ""')

    def update_zufang_crawled(self, url):
        return self.database.execute('update xiaoqu set zf_status=?, timestamp=? where url=?',
                                     ('y', self.timestamp, url))

    @property
    def failed_list(self):
        return self.failedpage_list


class ChengjiaoCrawler(Crawler):
    def __init__(self, database):
        Crawler.__init__(self)
        self.database = database
        self.failedpage_list = []
        self.did_start_thread_pool = False

    def crawl(self, url):
        code = Crawler.crawl(self, url)
        if code == -2 or code == -1:
            self.failedpage_list.append(url)
        return code

    def extract_func(self, soup, res):
        try:
            div = soup.find('div', {'class': 'page-box house-lst-page-box'})
            if div is None:
                # this xiaoqu has no deal records
                return 0
            if not self.did_start_thread_pool:
                self.did_start_thread_pool = True
                page_dic = 'page_dic=' + div['page-data']
                exec(page_dic)
                total_pages = page_dic['totalPage']
                url_need_handle = urlparse.urlparse(res.request.url)[0] + '://' + urlparse.urlparse(res.request.url)[1]
                url_need_handle = url_need_handle + div['page-url']
                page_urls = []
                for i in range(1, total_pages + 1):
                    page_urls.append(url_need_handle.replace('{page}', str(i)))
                if len(page_urls) > 0:
                    self.start_thread_pool(self.crawl, page_urls)
            else:
                self.extract_chengjiao(soup)
        except Exception as e:
            print 'Failed to parse chengjiao page count, e: {}'.format(e)
        finally:
            return 0

    def extract_chengjiao(self, soup):
        li_tags = soup.find('ul', {'class': 'listContent'}).find_all('li')
        for li_tag in li_tags:
            info_dict = {}
            tag = li_tag.find('div', {'class': 'title'}).find('a', href=True)
            if tag is None:
                # no deal detail page, skip
                continue
            info_dict['url'] = tag['href']
            info_dict['timestamp'] = self.timestamp
            try:
                title_array = tag.text.strip().split(' ')
                info_dict['xq_name'], info_dict['house_type'], info_dict['size'] = title_array
                tag = li_tag.find('img', {'class': 'lj-lazy'})
                if tag:
                    info_dict['img'] = tag['src']
                tag = li_tag.find('div', {'class': 'houseInfo'})
                if tag:
                    house_info_content = tag.renderContents().strip().decode('utf-8')
                    house_info_text = re.match('.+</span>(.*)', house_info_content).group(1)
                    house_info_text = house_info_text.replace(' ', '')
                    house_infos = house_info_text.split('|')
                    info_dict['face'], info_dict['decorate'], info_dict['lift'] = house_infos
                tag = li_tag.find('div', {'class': 'positionInfo'})
                if tag:
                    house_info_content = tag.renderContents().strip().decode('utf-8')
                    house_info_text = re.match('.+</span>(.*)', house_info_content).group(1)
                    info_dict['time'] = house_info_text
                tag = li_tag.find('span', {'class': 'dealHouseTxt'})
                if tag:
                    span_tags = tag.find_all('span')
                    if len(span_tags) > 0:
                        info_dict['subway'] = span_tags[-1].text.strip()
                tag = li_tag.find('div', {'class': 'dealDate'})
                if tag:
                    info_dict['deal_time'] = tag.text.strip()
                tag = li_tag.find('div', {'class': 'totalPrice'}).find('span')
                if tag:
                    info_dict['price'] = tag.text.strip()
                tag = li_tag.find('div', {'class': 'unitPrice'})
                if tag:
                    info_dict['price_unit'] = tag.find('span').text.strip()
            except Exception as e:
                print '{} failed to extract deal-record fields, e: {}'.format(info_dict['url'], e)
            params = self.database.format_insert_params(['url', 'xq_name', 'img', 'house_type',
                                                         'size', 'face', 'decorate', 'lift',
                                                         'time', 'subway', 'deal_time', 'price',
                                                         'price_unit', 'timestamp'], info_dict)
            self.database.execute('insert into chengjiao values(?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)', params)


class ZufangCrawler(Crawler):
    def __init__(self, database):
        Crawler.__init__(self)
        self.database = database
        self.failedpage_list = []
        self.did_start_thread_pool = False

    def crawl(self, url):
        code = Crawler.crawl(self, url)
        if code == -2 or code == -1:
            self.failedpage_list.append(url)
        return code

    def extract_func(self, soup, res):
        try:
            div = soup.find('div', {'class': 'page-box house-lst-page-box'})
            if div is None:
                # this xiaoqu has no rental listings
                return 0
            if not self.did_start_thread_pool:
                self.did_start_thread_pool = True
                page_dic = 'page_dic=' + div['page-data']
                exec(page_dic)
                total_pages = page_dic['totalPage']
                url_need_handle = urlparse.urlparse(res.request.url)[0] + '://' + urlparse.urlparse(res.request.url)[1]
                url_need_handle = url_need_handle + div['page-url']
                page_urls = []
                for i in range(1, total_pages + 1):
                    page_urls.append(url_need_handle.replace('{page}', str(i)))
                if len(page_urls) > 0:
                    self.start_thread_pool(self.crawl, page_urls)
            else:
                self.extract_zufang(soup)
        except Exception as e:
            print 'Failed to parse zufang page count, e: {}'.format(e)
        finally:
            return 0

    def extract_zufang(self, soup):
        li_tags = soup.find('ul', {'class': 'house-lst'}).find_all('li')
        for li_tag in li_tags:
            info_dict = {}
            tag = li_tag.find('div', {'class': 'pic-panel'}).find('a', href=True)
            if tag is None:
                # no rental detail page, skip
                continue
            info_dict['url'] = tag['href']
            info_dict['img'] = tag.find('img')['src']
            info_dict['timestamp'] = self.timestamp
            info_dict['ziru'] = 'y' if li_tag.find('div', {'class': 'ziroomTag zufang_ziroom'}) else 'n'
            try:
                tag = li_tag.find('div', {'class': 'info-panel'}).find('h2').find('a')
                info_dict['name'] = tag.text
                where_tag = li_tag.find('div', {'class': 'col-1'}).find('div', {'class': 'where'})
                tag = where_tag.find('span', {'class': 'region'})
                info_dict['xq_name'] = tag.text.strip()
                tag = where_tag.find('span', {'class': 'zone'}).find('span')
                info_dict['house_type'] = tag.text.strip()
                tag = where_tag.find('span', {'class': 'meters'})
                info_dict['size'] = re.search(r'\d+', tag.text).group()
                info_dict['face'] = tag.next_sibling.text.strip()
                where_tag = li_tag.find('div', {'class': 'col-1'}).find('div', {'class': 'con'})
                tag = where_tag.find('a')
                info_dict['group_name'] = tag.text.strip()
                info_dict['group_url'] = tag['href']
                tag = tag.next_sibling
                info_dict['floor'] = tag.next_sibling
                tag = tag.next_sibling.next_sibling
                info_dict['time'] = re.search(r'\d*', tag.next_sibling).group()
                try:
                    where_tag = li_tag.find('div', {'class': 'col-1'}).find('div', {'class': 'view-label left'})
                    tag = where_tag.find('span', {'class': 'decoration-ex'})
                    if tag:
                        info_dict['decorate'] = tag.find('span').text
                    tag = where_tag.find('span', {'class': 'heating-ex'})
                    if tag:
                        info_dict['heat'] = tag.find('span').text
                    tag = where_tag.find('span', {'class': 'fang-subway-ex'})
                    if tag:
                        info_dict['subway'] = tag.find('span').text
                except Exception:
                    pass
                where_tag = li_tag.find('div', {'class': 'col-3'})
                tag = where_tag.find('span', {'class': 'num'})
                info_dict['price'] = tag.text
                tag = where_tag.find('div', {'class': 'price-pre'})
                info_dict['update_time'] = tag.text.strip().split(' ')[0]
            except Exception as e:
                print '{} failed to extract rental-record fields, e: {}'.format(info_dict['url'], e)
            params = self.database.format_insert_params(['url', 'xq_name', 'name', 'img', 'ziru',
                                                         'house_type', 'size', 'face', 'group_name',
                                                         'group_url', 'floor', 'time',
                                                         'decorate', 'heat', 'subway', 'update_time',
                                                         'price', 'timestamp'], info_dict)
            self.database.execute('insert into zufang values(?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)', params)


def init_database(db_path='bj.db'):
    db = SQLiteHelper(db_path)
    db.execute('''create table if not exists chengqu(
                      name text primary key unique,
                      url text,
                      status text,
                      city text,
                      timestamp text)''')
    db.execute('''create table if not exists xiaoqu(
                      url text primary key unique,
                      img text,
                      name text,
                      district text,
                      bizcircle text,
                      type text,
                      time text,
                      cj_url text,
                      zf_url text,
                      cj_status text,
                      zf_status text,
                      timestamp text)''')
    db.execute('''create table if not exists chengjiao(
                      url text primary key unique,
                      xq_name text,
                      img text,
                      house_type text,
                      size text,
                      face text,
                      decorate text,
                      lift text,
                      time text,
                      subway text,
                      deal_time text,
                      price text,
                      price_unit text,
                      timestamp text)''')
    db.execute('''create table if not exists zufang(
                      url text primary key unique,
                      xq_name text,
                      name text,
                      img text,
                      ziru text,
                      house_type text,
                      size text,
                      face text,
                      group_name text,
                      group_url text,
                      floor text,
                      time text,
                      decorate text,
                      heat text,
                      subway text,
                      update_time text,
                      price text,
                      timestamp text)''')
    return db
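

# Schema overview (as defined above): chengqu rows (districts) feed xiaoqu rows
# (residential compounds), and each xiaoqu row carries a cj_url / zf_url that
# ChengjiaoCrawler and ZufangCrawler expand into chengjiao (sold deals) and
# zufang (rentals) rows. The status / cj_status / zf_status flags ('n' -> 'y')
# let main() resume an interrupted run without re-crawling finished items.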


def main():
    auto_proxy = Proxy()
    proxy_pool = auto_proxy.get_proxy()
    db = init_database()

    chengqu = ChengquCrawler(db)
    if chengqu.crawl() != 0:
        print u'Failed to crawl the districts'
    result = chengqu.fetchall()
    for cq in result:
        if cq[2] == 'n':  # crawl the xiaoqu of every district not yet crawled
            xq = XiaoquCrawler(db)
            xq.crawl(cq[1])
            if len(xq.failed_list) == 0:  # success: mark the district status as 'y'
                chengqu.update_chengqu_crawled(cq[1])
                print u'District {} crawled successfully'.format(cq[0])
            else:
                print(u'The following links were not crawled: {}'.format(str(xq.failed_list)))

    xiaoqu = XiaoquCrawler(db)
    result = xiaoqu.fetchall_uncrawled_cj()  # xiaoqu whose deal records have not been crawled yet
    count = 0
    for xq in result:  # xq = (url, cj_url, cj_status)
        proxy_pool = auto_proxy.get_proxy()
        cj = ChengjiaoCrawler(db)
        if count != 0 and count % 2 == 0:
            cj.proxies_list = proxy_pool
        else:
            cj.proxies_list = []
        if count != 0 and count % 100 == 0:
            print(u'Pausing for 1 minute')
            time.sleep(1 * 60)
        code = cj.crawl(xq[1])
        if code == -2:
            print(u'IP banned, pausing for 30 minutes')
            time.sleep(30 * 60)
        if len(cj.failedpage_list) == 0:  # success: mark the xiaoqu's deal records as crawled
            xiaoqu.update_chengjiao_crawled(xq[0])
            count += 1
            print u'Crawled {}/{}\n'.format(str(count), str(len(result))),
        else:
            print(u'The following links failed: {}\nRetrying...'.format(str(cj.failedpage_list)))
            failed_urls = cj.failedpage_list
            cj.failedpage_list = []
            cj.proxies_list = []  # failures are usually caused by the proxies
            cj.start_thread_pool(cj.crawl, failed_urls)
            if len(cj.failedpage_list) == 0:
                print(u'Retry succeeded!')
                xiaoqu.update_chengjiao_crawled(xq[0])
                count += 1
                print u'Crawled {}/{}\n'.format(str(count), str(len(result))),
        del cj

    result = xiaoqu.fetchall_uncrawled_zf()
    count = 0
    for xq in result:  # xq = (url, zf_url, zf_status)
        proxy_pool = auto_proxy.get_proxy()
        zf = ZufangCrawler(db)
        if count != 0 and count % 2 == 0:
            zf.proxies_list = proxy_pool
        else:
            zf.proxies_list = []
        if count != 0 and count % 100 == 0:
            print(u'Pausing for 1 minute')
            time.sleep(1 * 60)
        code = zf.crawl(xq[1])
        if code == -2:
            print(u'IP banned, pausing for 30 minutes')
            time.sleep(30 * 60)
        if len(zf.failedpage_list) == 0:  # success: mark the xiaoqu's rentals as crawled
            xiaoqu.update_zufang_crawled(xq[0])
            count += 1
            print u'Crawled {}/{}\n'.format(str(count), str(len(result))),
        else:
            print(u'The following links failed: {}\nRetrying...'.format(str(zf.failedpage_list)))
            failed_urls = zf.failedpage_list
            zf.failedpage_list = []
            zf.proxies_list = []  # failures are usually caused by the proxies
            zf.start_thread_pool(zf.crawl, failed_urls)
            if len(zf.failedpage_list) == 0:
                print(u'Retry succeeded!')
                xiaoqu.update_zufang_crawled(xq[0])
                count += 1
                print u'Crawled {}/{}\n'.format(str(count), str(len(result))),
        del zf


if __name__ == '__main__':
    import sys
    sys.exit(int(main() or 0))