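"""Scrape Shanghai shop-rental listings from sh.58.com with aiohttp.

Collects listing URLs from district index pages (Huangpu, Jing'an, Xuhui),
pulls the area, telephone, and address from each listing page, and exports
the rows to a timestamped CSV via tablib. URLs that fail to parse are saved
to a bad_urls_<timestamp>.py module so a later run can retry them.
"""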
import aiohttp
import asyncio
from requests_html import HTML
import tablib
import time

timestamp = int(time.time())

# model: one row per listing
headers = ('url', 'area', 'telephone', 'address')  # CSV fields
data = []
save_data = tablib.Dataset(*data, headers=headers)

def export_csv(data):
    with open('test_{}.csv'.format(timestamp), 'wb') as f:
        f.write(data.export('csv').encode("utf-8"))

async def fetch(session, url):
    async with session.get(url) as response:
        return await response.text()
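
# A minimal variant of fetch() with a per-request timeout. This helper is not
# part of the original gist: the name fetch_with_timeout and the 30-second
# budget are illustrative assumptions (requires aiohttp >= 3.x).
async def fetch_with_timeout(session, url, seconds=30):
    timeout = aiohttp.ClientTimeout(total=seconds)
    async with session.get(url, timeout=timeout) as response:
        return await response.text()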

def get_data(t_url, html, is_test=False):
    global save_data
    url = t_url
    area = html.find('body > div.main-wrap > div.house-basic-info.clearfix > div.house-basic-right.fr > ul > li:nth-child(1) > span.house_basic_title_content_item2', first=True)
    print("area:", area.full_text)
    telephone = html.find('#houseChatEntry > div > p.phone-num', first=True)
    print("telephone:", telephone.full_text)
    # address
    address = html.find("body > div.main-wrap > div.house-basic-info.clearfix > div.house-basic-right.fr > ul > li:nth-child(6)", first=True)
    address_to_save = ' '.join(address.full_text.split())  # collapse runs of whitespace
    print("address:", address_to_save)
    # floor: selector noted but not extracted yet:
    # body > div.main-wrap > div.house-basic-info.clearfix > div.house-basic-right.fr > ul > li:nth-child(4) > span.house_basic_title_content_item2
    if not is_test:
        save_data.append((url, area.full_text, telephone.full_text, address_to_save))
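
# A hedged sketch (not in the original gist) of how the floor selector noted
# above could be wired in; get_floor is a hypothetical helper, never called
# below, and untested against the live page.
def get_floor(html):
    floor = html.find(
        "body > div.main-wrap > div.house-basic-info.clearfix > "
        "div.house-basic-right.fr > ul > li:nth-child(4) > "
        "span.house_basic_title_content_item2",
        first=True,
    )
    return ' '.join(floor.full_text.split()) if floor else ''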

async def get_target_page_data(session, t_url):
    global bad_urls
    print("t_url:", t_url)  # e.g. http://sh.58.com/shangpu/34391724062893x.shtml
    t_doc = await fetch(session, t_url)
    t_html = HTML(html=t_doc)
    try:
        get_data(t_url, t_html)
    except Exception:
        print("bad url:", t_url)  # collect failures so they can be retried later
        bad_urls.append(t_url)
doc = "" | |
html = "" | |
t_html = "" | |
all_target_urls = [] | |
bad_urls = [] | |

async def main_get_target_urls():
    '''Collect listing-page URLs from the district index pages.'''
    url_list = [
        # Huangpu
        "http://sh.58.com/huangpu/shangpucz/?area=1000_2000&sq=1&PGTID=0d306b35-0057-7134-a99d-27cf70aa6504&ClickID=6",  # page 1
        "http://sh.58.com/huangpu/shangpucz/pn2/?area=1000_2000&sq=1&PGTID=0d306b35-0057-7c3d-057d-33d6738862b6&ClickID=2",  # page 2
        # Jing'an
        "http://sh.58.com/jingan/shangpucz/?area=1000_2000&sq=1&PGTID=0d306b35-0057-7701-b196-1d9583e9c1ae&ClickID=2",  # page 1
        "http://sh.58.com/jingan/shangpucz/pn2/?area=1000_2000&sq=1&PGTID=0d306b35-0057-b5f9-6ebb-5140d63e954c&ClickID=2",  # page 2
        "http://sh.58.com/jingan/shangpucz/pn3/?area=1000_2000&sq=1&PGTID=0d306b35-0057-bfbc-e51b-f42963cd8e56&ClickID=2",  # page 3
        # Xuhui
        "http://sh.58.com/xuhui/shangpucz/?area=1000_2000&sq=1&PGTID=0d306b35-0057-bff2-6fc6-6030b511f494&ClickID=2",
        "http://sh.58.com/xuhui/shangpucz/pn2/?area=1000_2000&sq=1&PGTID=0d306b35-0057-9297-bf16-230b883dc749&ClickID=2",
        "http://sh.58.com/xuhui/shangpucz/pn3/?area=1000_2000&sq=1&PGTID=0d306b35-0057-9b8c-c748-8d19d69c83b8&ClickID=2",
    ]
    global doc, html, t_html, all_target_urls  # for debugging
    async with aiohttp.ClientSession() as session:
        for url in url_list[:]:
            # doc = await fetch(session, 'http://python.org')
            doc = await fetch(session, url)  # stored on each pass
            html = HTML(html=doc)
            target_urls = [link for link in html.links if ("sh.58.com/shangpu" in link or "jxjump.58.com/service" in link)]
            # first collect all the target_urls and save them
            print("url:{} len:{}".format(url, len(target_urls)))
            for t_url in target_urls[:]:
                all_target_urls.append(t_url)
                # get_target_page_data(session, t_url)

async def main(urls):
    '''Fetch each listing page and store its details.'''
    async with aiohttp.ClientSession() as session:
        for t_url in urls:
            await get_target_page_data(session, t_url)
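
# A sketch of a concurrent alternative to main(), not in the original gist.
# asyncio.gather fires all fetches at once; the unbounded concurrency is an
# assumption and may trip the site's rate limiting, so the sequential main()
# above remains the conservative default.
async def main_concurrent(urls):
    async with aiohttp.ClientSession() as session:
        await asyncio.gather(*(get_target_page_data(session, u) for u in urls))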

def end_and_clearup():
    # only needs to run once per session
    global all_target_urls, bad_urls
    with open("bad_urls_{}.py".format(timestamp), "w") as bad_urls_file:
        content = "bad_urls = " + str(bad_urls)
        bad_urls_file.write(content)

if __name__ == '__main__':
    import bad_urls_1528789277
    loop = asyncio.get_event_loop()
    urls = bad_urls_1528789277.bad_urls  # start here; each later run reuses the saved bad_urls
    loop.run_until_complete(main(urls))
    export_csv(save_data)
    end_and_clearup()  # run manually