#! /usr/bin/env python
#coding:utf-8
# auto_proxy.py -- simple proxy pool built from xicidaili.com (imported by the crawler below)
from bs4 import BeautifulSoup
import requests
import re
import time
from multiprocessing.dummy import Pool as ThreadPool


class Proxy(object):
    def __init__(self, max_page=1):
        self.timestamp = time.time()
        self.max_page = max_page
        self.proxies = []
        self.checked_proxies = []
        self.s = requests.Session()
        self.headers = {
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Encoding': 'gzip, deflate, sdch',
            'Accept-Language': 'zh-CN,zh;q=0.8,en;q=0.6,it;q=0.4,zh-TW;q=0.2',
            'Connection': 'keep-alive',
            'Host': 'www.xicidaili.com',
            'Upgrade-Insecure-Requests': '1',
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36'
        }
        self.s.headers.update(self.headers)
        self.url = 'http://www.xicidaili.com/nn/'

    def _parse_proxy(self):
        res = self.s.get(self.url)
        try:
            soup = BeautifulSoup(res.content, 'html.parser')
            tr_tags = soup.find('table', {'id': 'ip_list'}).find_all('tr')
            for tr in tr_tags:
                tds = tr.find_all('td')
                if not tds:  # skip the header row
                    continue
                time_tag = tds[6].find('div', {'class': 'bar'}, title=True)
                if time_tag:
                    conn_time = re.search('\d+\.\d+', str(time_tag)).group()
                    if float(conn_time) > 4:
                        continue
                ip = tds[1].text
                port = tds[2].text
                schema = tds[5].text
                if schema == 'HTTPS':
                    self.proxies.append({'https': ip + ':' + port})
                else:
                    self.proxies.append({'http': ip + ':' + port})
        except Exception as e:
            print(u'Failed to fetch proxies: {}'.format(e))

    def _check_proxy(self, proxy, anonymous=True):
        try:
            r = requests.get('http://httpbin.org/ip', proxies=proxy, timeout=3)
            data = r.json()
            # high-anonymity check: the reported origin must be the proxy itself
            if anonymous:
                if data['origin'] == proxy.values()[0].split(':')[0]:
                    self.checked_proxies.append(proxy)
            else:
                self.checked_proxies.append(proxy)
        except Exception:
            pass

    def get_proxy(self):
        time_distance = time.time() - self.timestamp
        if len(self.checked_proxies) == 0 or time_distance > 60 * 30:
            print(u'Refreshing proxy pool...')
            self.proxies = []
            self.checked_proxies = []
            self._parse_proxy()
            pool = ThreadPool(8)
            pool.map(self._check_proxy, self.proxies)
            pool.close()
            pool.join()
            self.timestamp = time.time()
            print(u'New proxy pool: \n{}\n'.format(self.checked_proxies))
        return self.checked_proxies


if __name__ == '__main__':
    ins = Proxy()
    print ins.get_proxy()
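
A minimal usage sketch for auto_proxy.Proxy (not part of the original gist; it assumes xicidaili.com is reachable and at least one proxy survives the httpbin check): get_proxy() returns a list of single-entry dicts that can be passed straight to requests.

# Hypothetical example: pick one verified proxy at random and route a request through it.
import random
import requests
from auto_proxy import Proxy

pool = Proxy().get_proxy()      # e.g. [{'http': '1.2.3.4:8080'}, {'https': '5.6.7.8:3128'}]
if pool:
    proxy = random.choice(pool)
    r = requests.get('http://httpbin.org/ip', proxies=proxy, timeout=10)
    print(r.json())             # 'origin' should report the proxy address, not yours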
#! /usr/bin/env python
#coding:utf-8
# Lianjia crawler: districts (chengqu) -> communities (xiaoqu) -> deal (chengjiao) and rental (zufang) records
import threading
import sqlite3
import random
import urlparse
import re
import time
import json
from datetime import datetime
from requests import Session
from requests.exceptions import ProxyError, ConnectTimeout
from bs4 import BeautifulSoup
from multiprocessing.dummy import Pool
from auto_proxy import Proxy


class SQLiteHelper:
    def __init__(self, db_path):
        self.db_path = db_path

    def conn_transaction(func):
        # decorator used inside this class body: open a connection, pass it in
        # as the 'conn' kwarg, and close it after the wrapped call returns
        def connection(self, *args, **kwargs):
            conn = sqlite3.connect(self.db_path)
            kwargs['conn'] = conn
            result = func(self, *args, **kwargs)
            conn.close()
            return result
        return connection

    @conn_transaction
    def execute(self, command, params=None, conn=None):
        cursor = conn.cursor()
        result = 0
        try:
            if params:
                cursor.execute(command, params)
            else:
                cursor.execute(command)
            conn.commit()
        except Exception as e:
            print e
            conn.rollback()
            result = -1
        finally:
            cursor.close()
        return result

    @conn_transaction
    def fetch_data(self, command, conn=None):
        cursor = conn.cursor()
        result = []
        try:
            cursor.execute(command)
            result = cursor.fetchall()
        except Exception as e:
            print e
        finally:
            cursor.close()
        return result

    def format_insert_params(self, column_name_list, params):
        # build a tuple in column order, filling missing keys with ''
        result = []
        for col in column_name_list:
            if col in params:
                result.append(params[col])
            else:
                result.append('')
        return tuple(result)


HEADER_UA_LIST = [
    {'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'},
    {'User-Agent': 'Mozilla/5.0 (Windows NT 6.2) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.12 Safari/535.11'},
    {'User-Agent': 'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.2; Trident/6.0)'},
    {'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:34.0) Gecko/20100101 Firefox/34.0'},
    {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/44.0.2403.89 Chrome/44.0.2403.89 Safari/537.36'},
    {'User-Agent': 'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50'},
    {'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50'},
    {'User-Agent': 'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)'},
    {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1'},
    {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1'},
    {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11'},
    {'User-Agent': 'Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; en) Presto/2.8.131 Version/11.11'},
    {'User-Agent': 'Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11'},
]


class Crawler:
    def __init__(self):
        self.proxies_list = []

    def crawl(self, url):
        proxy = None
        if len(self.proxies_list) > 0:
            proxy = self.proxies_list[random.randint(0, len(self.proxies_list) - 1)]
            print(u'Using proxy {} at {}'.format(proxy, self.timestamp))
        try:
            res = Session().get(url, headers=HEADER_UA_LIST[random.randint(0, len(HEADER_UA_LIST) - 1)], proxies=proxy, timeout=15)
            soup = BeautifulSoup(res.content, 'html.parser')
            soup_content = soup.encode_contents(encoding='utf-8')
            # anti-crawler page: "abnormal traffic from your IP"
            if soup_content.find(u'您所在的IP流量异常'.encode('utf-8')) != -1:
                return -2
        except (ProxyError, ConnectTimeout):
            if proxy in self.proxies_list:
                self.proxies_list.remove(proxy)
            return self.crawl(url)
        except Exception as e:
            print 'Crawl {} error: {}'.format(url, e)
            return -1
        else:
            return self.extract_func(soup, res)

    def start_thread_pool(self, func, args, nums=8):
        thread_pool = Pool(processes=nums)
        thread_pool.map(func, args)
        thread_pool.close()
        thread_pool.join()
        del thread_pool

    def extract_func(self, soup, res):
        pass  # implemented by subclasses

    @property
    def timestamp(self):
        return datetime.now().strftime('%Y-%m-%d %H:%M:%S')


class ChengquCrawler(Crawler):
    def __init__(self, database, base_url='http://bj.lianjia.com', path='/xiaoqu/'):
        Crawler.__init__(self)
        self.base_url = base_url
        self.path = path
        self.database = database

    def crawl(self):
        result = self.fetchall()
        if len(result) == 0:
            return Crawler.crawl(self, self.base_url + self.path)
        else:
            return 0

    def extract_func(self, soup, res):
        try:
            tag = soup.find('div', {'class': 'position'}).find('a', {'href': '/xiaoqu/'})
            city = tag['title'].replace(u'小区区域', '')
        except Exception as e:
            print 'Failed to parse city: {}'.format(e)
            return -1
        else:
            try:
                tag_list = soup.find('div', {'data-role': 'ershoufang'}).find_all('a', href=True)
                info_dict = {}
                for t in tag_list:
                    try:
                        info_dict['url'] = self.base_url + t['href']
                        info_dict['name'] = t.text
                        info_dict['city'] = city
                        info_dict['status'] = 'n'
                        info_dict['timestamp'] = self.timestamp
                    except Exception as e:
                        print u'{} failed to parse district, e: {}'.format(str(info_dict), e)
                        continue
                    else:
                        params = self.database.format_insert_params(['name', 'url', 'status', 'city', 'timestamp'], info_dict)
                        self.database.execute('insert into chengqu values(?, ?, ?, ?, ?)', params)
            except Exception as e:
                print e
                return -1
        return 0

    def fetchall(self):
        return self.database.fetch_data('select * from chengqu')

    def update_chengqu_crawled(self, url):
        return self.database.execute('update chengqu set status=?, timestamp=? where url=?', ('y', self.timestamp, url))


class XiaoquCrawler(Crawler):
    def __init__(self, database):
        Crawler.__init__(self)
        self.database = database
        self.failedpage_list = []
        self.did_start_thread_pool = False

    def crawl(self, url):
        code = Crawler.crawl(self, url)
        if code == -2 or code == -1:
            self.failedpage_list.append(url)
        return code

    def extract_func(self, soup, res):
        try:
            if not self.did_start_thread_pool:
                self.did_start_thread_pool = True
                div = soup.find('div', {'class': 'page-box house-lst-page-box'})
                # page-data is a JSON attribute, e.g. {"totalPage":27,"curPage":1}
                page_dic = json.loads(div['page-data'])
                total_pages = page_dic['totalPage']
                url_need_handle = urlparse.urlparse(res.request.url)[0] + '://' + urlparse.urlparse(res.request.url)[1]
                url_need_handle = url_need_handle + div['page-url']
                page_urls = []
                for i in range(1, total_pages + 1):
                    page_urls.append(url_need_handle.replace('{page}', str(i)))
                if len(page_urls) > 0:
                    self.start_thread_pool(self.crawl, page_urls)
            else:
                self.extract_xiaoqu(soup)
        except Exception as e:
            print 'Failed to parse xiaoqu page count, e: {}'.format(e)
        finally:
            return 0

    def extract_xiaoqu(self, soup):
        xiaoqu_list = soup.find_all('li', {'class': 'clear'})
        for xiaoqu in xiaoqu_list:
            info_dict = {}
            try:
                info_dict['name'] = xiaoqu.find('div', {'class': 'title'}).find('a').text
                info_dict['url'] = xiaoqu.find('div', {'class': 'title'}).find('a')['href']
                try:
                    info_dict['img'] = xiaoqu.find('img', {'class': 'lj-lazy'})['src']
                    info_dict['district'] = xiaoqu.find('div', {'class': 'positionInfo'}).find('a', {'class': 'district'}).text
                    info_dict['bizcircle'] = xiaoqu.find('div', {'class': 'positionInfo'}).find('a', {'class': 'bizcircle'}).text
                    re_string = xiaoqu.find('div', {'class': 'positionInfo'}).renderContents().strip().decode('utf-8')
                    re_match = re.split('</a>', re_string)
                    if len(re_match) > 1:
                        type_time = re_match[-1].strip().lstrip('/').split('/')
                        if len(type_time) >= 2:
                            xiaoqu_type = '/'.join(type_time[0:-1])
                            xiaoqu_time = type_time[-1].strip()
                            xiaoqu_time = re.search('\d*', xiaoqu_time).group()
                            info_dict['type'] = xiaoqu_type
                            info_dict['time'] = xiaoqu_time
                    house_info = xiaoqu.find('div', {'class': 'houseInfo'})
                    for a_tag in house_info.find_all('a', href=True):
                        href = a_tag['href']
                        if re.match('.*/chengjiao/.*', href):
                            info_dict['cj_url'] = href
                        elif re.match('.*/zufang/.*', href):
                            info_dict['zf_url'] = href
                except Exception as e:
                    print u'{} failed to extract some fields, e: {}'.format(info_dict['name'], e)
                info_dict['cj_status'] = 'n'
                info_dict['zf_status'] = 'n'
                info_dict['timestamp'] = self.timestamp
                params = self.database.format_insert_params(['url', 'img', 'name', 'district', 'bizcircle', 'type', 'time', 'cj_url', 'zf_url', 'cj_status', 'zf_status', 'timestamp'], info_dict)
                self.database.execute('insert into xiaoqu values(?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)', params)
            except Exception as e:
                print(u'Failed to extract a xiaoqu; e: {}\n {}'.format(e, str(xiaoqu).decode('utf-8')))
                continue

    def fetchall(self):
        '''
        (url, cj_url, cj_status, zf_url, zf_status, name)
        '''
        return self.database.fetch_data('select url, cj_url, cj_status, zf_url, zf_status, name from xiaoqu')

    def fetchall_uncrawled_cj(self):
        '''
        (url, cj_url, cj_status)
        '''
        return self.database.fetch_data('select url, cj_url, cj_status from xiaoqu where cj_status="n"')

    def update_chengjiao_crawled(self, url):
        return self.database.execute('update xiaoqu set cj_status=?, timestamp=? where url=?', ('y', self.timestamp, url))

    def fetchall_uncrawled_zf(self):
        '''
        (url, zf_url, zf_status)
        '''
        return self.database.fetch_data('select url, zf_url, zf_status from xiaoqu where zf_status="n" and zf_url != ""')

    def update_zufang_crawled(self, url):
        return self.database.execute('update xiaoqu set zf_status=?, timestamp=? where url=?', ('y', self.timestamp, url))

    @property
    def failed_list(self):
        return self.failedpage_list


class ChengjiaoCrawler(Crawler):
    def __init__(self, database):
        Crawler.__init__(self)
        self.database = database
        self.failedpage_list = []
        self.did_start_thread_pool = False

    def crawl(self, url):
        code = Crawler.crawl(self, url)
        if code == -2 or code == -1:
            self.failedpage_list.append(url)
        return code

    def extract_func(self, soup, res):
        try:
            div = soup.find('div', {'class': 'page-box house-lst-page-box'})
            if div is None:
                # the xiaoqu has no deal records
                return 0
            if not self.did_start_thread_pool:
                self.did_start_thread_pool = True
                # page-data is a JSON attribute, e.g. {"totalPage":27,"curPage":1}
                page_dic = json.loads(div['page-data'])
                total_pages = page_dic['totalPage']
                url_need_handle = urlparse.urlparse(res.request.url)[0] + '://' + urlparse.urlparse(res.request.url)[1]
                url_need_handle = url_need_handle + div['page-url']
                page_urls = []
                for i in range(1, total_pages + 1):
                    page_urls.append(url_need_handle.replace('{page}', str(i)))
                if len(page_urls) > 0:
                    self.start_thread_pool(self.crawl, page_urls)
            else:
                self.extract_chengjiao(soup)
        except Exception as e:
            print 'Failed to parse chengjiao page count, e: {}'.format(e)
        finally:
            return 0

    def extract_chengjiao(self, soup):
        li_tags = soup.find('ul', {'class': 'listContent'}).find_all('li')
        for li_tag in li_tags:
            info_dict = {}
            tag = li_tag.find('div', {'class': 'title'}).find('a', href=True)
            if tag is None:
                # no deal detail page, skip
                continue
            info_dict['url'] = tag['href']
            info_dict['timestamp'] = self.timestamp
            try:
                title_array = tag.text.strip().split(' ')
                info_dict['xq_name'], info_dict['house_type'], info_dict['size'] = title_array
                tag = li_tag.find('img', {'class': 'lj-lazy'})
                if tag:
                    info_dict['img'] = tag['src']
                tag = li_tag.find('div', {'class': 'houseInfo'})
                if tag:
                    house_info_content = tag.renderContents().strip().decode('utf-8')
                    house_info_text = re.match('.+</span>(.*)', house_info_content).group(1)
                    house_info_text = house_info_text.replace(' ', '')
                    house_infos = house_info_text.split('|')
                    info_dict['face'], info_dict['decorate'], info_dict['lift'] = house_infos
                tag = li_tag.find('div', {'class': 'positionInfo'})
                if tag:
                    house_info_content = tag.renderContents().strip().decode('utf-8')
                    house_info_text = re.match('.+</span>(.*)', house_info_content).group(1)
                    info_dict['time'] = house_info_text
                tag = li_tag.find('span', {'class': 'dealHouseTxt'})
                if tag:
                    span_tags = tag.find_all('span')
                    if len(span_tags) > 0:
                        info_dict['subway'] = span_tags[-1].text.strip()
                tag = li_tag.find('div', {'class': 'dealDate'})
                if tag:
                    info_dict['deal_time'] = tag.text.strip()
                tag = li_tag.find('div', {'class': 'totalPrice'}).find('span')
                if tag:
                    info_dict['price'] = tag.text.strip()
                tag = li_tag.find('div', {'class': 'unitPrice'})
                if tag:
                    info_dict['price_unit'] = tag.find('span').text.strip()
            except Exception as e:
                print '{} failed to extract deal record fields, e: {}'.format(info_dict['url'], e)
            params = self.database.format_insert_params(['url', 'xq_name', 'img', 'house_type',
                                                         'size', 'face', 'decorate', 'lift',
                                                         'time', 'subway', 'deal_time', 'price',
                                                         'price_unit', 'timestamp'], info_dict)
            self.database.execute('insert into chengjiao values(?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)', params)


class ZufangCrawler(Crawler):
    def __init__(self, database):
        Crawler.__init__(self)
        self.database = database
        self.failedpage_list = []
        self.did_start_thread_pool = False

    def crawl(self, url):
        code = Crawler.crawl(self, url)
        if code == -2 or code == -1:
            self.failedpage_list.append(url)
        return code

    def extract_func(self, soup, res):
        try:
            div = soup.find('div', {'class': 'page-box house-lst-page-box'})
            if div is None:
                # the xiaoqu has no rental listings
                return 0
            if not self.did_start_thread_pool:
                self.did_start_thread_pool = True
                # page-data is a JSON attribute, e.g. {"totalPage":27,"curPage":1}
                page_dic = json.loads(div['page-data'])
                total_pages = page_dic['totalPage']
                url_need_handle = urlparse.urlparse(res.request.url)[0] + '://' + urlparse.urlparse(res.request.url)[1]
                url_need_handle = url_need_handle + div['page-url']
                page_urls = []
                for i in range(1, total_pages + 1):
                    page_urls.append(url_need_handle.replace('{page}', str(i)))
                if len(page_urls) > 0:
                    self.start_thread_pool(self.crawl, page_urls)
            else:
                self.extract_zufang(soup)
        except Exception as e:
            print 'Failed to parse zufang page count, e: {}'.format(e)
        finally:
            return 0

    def extract_zufang(self, soup):
        li_tags = soup.find('ul', {'class': 'house-lst'}).find_all('li')
        for li_tag in li_tags:
            info_dict = {}
            tag = li_tag.find('div', {'class': 'pic-panel'}).find('a', href=True)
            if tag is None:
                # no rental detail page, skip
                continue
            info_dict['url'] = tag['href']
            info_dict['img'] = tag.find('img')['src']
            info_dict['timestamp'] = self.timestamp
            info_dict['ziru'] = 'y' if li_tag.find('div', {'class': 'ziroomTag zufang_ziroom'}) else 'n'
            try:
                tag = li_tag.find('div', {'class': 'info-panel'}).find('h2').find('a')
                info_dict['name'] = tag.text
                where_tag = li_tag.find('div', {'class': 'col-1'}).find('div', {'class': 'where'})
                tag = where_tag.find('span', {'class': 'region'})
                info_dict['xq_name'] = tag.text.strip()
                tag = where_tag.find('span', {'class': 'zone'}).find('span')
                info_dict['house_type'] = tag.text.strip()
                tag = where_tag.find('span', {'class': 'meters'})
                info_dict['size'] = re.search('\d+', tag.text).group()
                info_dict['face'] = tag.next_sibling.text.strip()
                where_tag = li_tag.find('div', {'class': 'col-1'}).find('div', {'class': 'con'})
                tag = where_tag.find('a')
                info_dict['group_name'] = tag.text.strip()
                info_dict['group_url'] = tag['href']
                tag = tag.next_sibling
                info_dict['floor'] = tag.next_sibling
                tag = tag.next_sibling.next_sibling
                info_dict['time'] = re.search('\d*', tag.next_sibling).group()
                try:
                    where_tag = li_tag.find('div', {'class': 'col-1'}).find('div', {'class': 'view-label left'})
                    tag = where_tag.find('span', {'class': 'decoration-ex'})
                    if tag:
                        info_dict['decorate'] = tag.find('span').text
                    tag = where_tag.find('span', {'class': 'heating-ex'})
                    if tag:
                        info_dict['heat'] = tag.find('span').text
                    tag = where_tag.find('span', {'class': 'fang-subway-ex'})
                    if tag:
                        info_dict['subway'] = tag.find('span').text
                except Exception:
                    pass
                where_tag = li_tag.find('div', {'class': 'col-3'})
                tag = where_tag.find('span', {'class': 'num'})
                info_dict['price'] = tag.text
                tag = where_tag.find('div', {'class': 'price-pre'})
                info_dict['update_time'] = tag.text.strip().split(' ')[0]
            except Exception as e:
                print '{} failed to extract rental record fields, e: {}'.format(info_dict['url'], e)
            params = self.database.format_insert_params(['url', 'xq_name', 'name', 'img', 'ziru',
                                                         'house_type', 'size', 'face', 'group_name',
                                                         'group_url', 'floor', 'time',
                                                         'decorate', 'heat', 'subway', 'update_time',
                                                         'price', 'timestamp'], info_dict)
            self.database.execute('insert into zufang values(?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)', params)


def init_database(db_path='bj.db'):
    db = SQLiteHelper(db_path)
    command = 'create table if not exists chengqu(name text primary key unique, \
                url text, \
                status text, \
                city text, \
                timestamp text)'
    db.execute(command)
    command = 'create table if not exists xiaoqu(url text primary key unique, \
                img text, \
                name text, \
                district text, \
                bizcircle text, \
                type text, \
                time text, \
                cj_url text, \
                zf_url text, \
                cj_status text, \
                zf_status text, \
                timestamp text)'
    db.execute(command)
    command = 'create table if not exists chengjiao(url text primary key unique, \
                xq_name text, \
                img text, \
                house_type text, \
                size text, \
                face text, \
                decorate text, \
                lift text, \
                time text, \
                subway text, \
                deal_time text, \
                price text, \
                price_unit text, \
                timestamp text)'
    db.execute(command)
    command = 'create table if not exists zufang(url text primary key unique, \
                xq_name text, \
                name text, \
                img text, \
                ziru text, \
                house_type text, \
                size text, \
                face text, \
                group_name text, \
                group_url text, \
                floor text, \
                time text, \
                decorate text, \
                heat text, \
                subway text, \
                update_time text, \
                price text, \
                timestamp text)'
    db.execute(command)
    return db


def main():
    auto_proxy = Proxy()
    proxy_pool = auto_proxy.get_proxy()
    db = init_database()
    chengqu = ChengquCrawler(db)
    if chengqu.crawl() != 0:
        print u'Failed to crawl districts'
    result = chengqu.fetchall()
    for cq in result:
        if cq[2] == 'n':  # xiaoqu lists under districts that have not been crawled yet
            xq = XiaoquCrawler(db)
            xq.crawl(cq[1])
            if len(xq.failed_list) == 0:  # crawl succeeded, mark the district as 'y'
                chengqu.update_chengqu_crawled(cq[1])
                print u'District {} crawled successfully'.format(cq[0])
            else:
                print(u'The following urls failed: {}'.format(str(xq.failed_list)))
    xiaoqu = XiaoquCrawler(db)
    result = xiaoqu.fetchall_uncrawled_cj()  # xiaoqu whose deal records have not been crawled
    count = 0
    for xq in result:  # xq = (url, cj_url, cj_status)
        proxy_pool = auto_proxy.get_proxy()
        cj = ChengjiaoCrawler(db)
        if count != 0 and count % 2 == 0:
            cj.proxies_list = proxy_pool
        else:
            cj.proxies_list = []
        if count != 0 and count % 100 == 0:
            print(u'Pausing for 1 minute')
            time.sleep(1 * 60)
        code = cj.crawl(xq[1])
        if code == -2:
            print(u'IP banned, pausing for 30 minutes')
            time.sleep(30 * 60)
        if len(cj.failedpage_list) == 0:  # crawl succeeded, mark the xiaoqu as 'y'
            xiaoqu.update_chengjiao_crawled(xq[0])
            count += 1
            print u'Crawled {}/{}\n'.format(str(count), str(len(result))),
        else:
            print(u'The following urls failed: {}\nretrying...'.format(str(cj.failedpage_list)))
            retry_urls = cj.failedpage_list
            cj.failedpage_list = []
            cj.proxies_list = []  # failures are usually caused by the proxy
            cj.start_thread_pool(cj.crawl, retry_urls)
            if len(cj.failedpage_list) == 0:
                print(u'Retry succeeded!')
                xiaoqu.update_chengjiao_crawled(xq[0])
                count += 1
                print u'Crawled {}/{}\n'.format(str(count), str(len(result))),
        del cj
    result = xiaoqu.fetchall_uncrawled_zf()
    count = 0
    for xq in result:  # xq = (url, zf_url, zf_status)
        proxy_pool = auto_proxy.get_proxy()
        zf = ZufangCrawler(db)
        if count != 0 and count % 2 == 0:
            zf.proxies_list = proxy_pool
        else:
            zf.proxies_list = []
        if count != 0 and count % 100 == 0:
            print(u'Pausing for 1 minute')
            time.sleep(1 * 60)
        code = zf.crawl(xq[1])
        if code == -2:
            print(u'IP banned, pausing for 30 minutes')
            time.sleep(30 * 60)
        if len(zf.failedpage_list) == 0:  # crawl succeeded, mark the xiaoqu as 'y'
            xiaoqu.update_zufang_crawled(xq[0])
            count += 1
            print u'Crawled {}/{}\n'.format(str(count), str(len(result))),
        else:
            print(u'The following urls failed: {}\nretrying...'.format(str(zf.failedpage_list)))
            retry_urls = zf.failedpage_list
            zf.failedpage_list = []
            zf.proxies_list = []  # failures are usually caused by the proxy
            zf.start_thread_pool(zf.crawl, retry_urls)
            if len(zf.failedpage_list) == 0:
                print(u'Retry succeeded!')
                xiaoqu.update_zufang_crawled(xq[0])
                count += 1
                print u'Crawled {}/{}\n'.format(str(count), str(len(result))),
        del zf


if __name__ == '__main__':
    import sys
    sys.exit(int(main() or 0))
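
A small follow-up sketch (assumed, not part of the gist) for inspecting the resulting bj.db after main() has run; the table and column names come from init_database() above, and the join assumes chengjiao.xq_name matches xiaoqu.name as extracted.

# Hypothetical example: count crawled deal records per district from bj.db.
import sqlite3

conn = sqlite3.connect('bj.db')
rows = conn.execute('select x.district, count(*) '
                    'from chengjiao c join xiaoqu x on c.xq_name = x.name '
                    'group by x.district').fetchall()
for district, n in rows:
    print district, n
conn.close()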