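"""Scrape Shanghai shop-rental listings from sh.58.com with aiohttp.

Collects listing URLs from district index pages (Huangpu, Jing'an, Xuhui),
pulls the area, telephone, and address from each listing page, and exports
the rows to a timestamped CSV via tablib. URLs that fail to parse are saved
to a bad_urls_<timestamp>.py module so a later run can retry them.
"""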
import aiohttp
import asyncio
from requests_html import HTML
import tablib
import time

timestamp = int(time.time())

# model: one row per listing
headers = ('url', 'area', 'telephone', 'address')  # CSV fields
data = []
save_data = tablib.Dataset(*data, headers=headers)

def export_csv(data):
    with open('test_{}.csv'.format(timestamp), 'wb') as f:
        f.write(data.export('csv').encode("utf-8"))

async def fetch(session, url):
    async with session.get(url) as response:
        return await response.text()
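
# A minimal variant of fetch() with a per-request timeout. This helper is not
# part of the original gist: the name fetch_with_timeout and the 30-second
# budget are illustrative assumptions (requires aiohttp >= 3.x).
async def fetch_with_timeout(session, url, seconds=30):
    timeout = aiohttp.ClientTimeout(total=seconds)
    async with session.get(url, timeout=timeout) as response:
        return await response.text()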

def get_data(t_url, html, is_test=False):
    global save_data
    url = t_url
    area = html.find('body > div.main-wrap > div.house-basic-info.clearfix > div.house-basic-right.fr > ul > li:nth-child(1) > span.house_basic_title_content_item2', first=True)
    print("area:", area.full_text)
    telephone = html.find('#houseChatEntry > div > p.phone-num', first=True)
    print("telephone:", telephone.full_text)
    # address
    address = html.find("body > div.main-wrap > div.house-basic-info.clearfix > div.house-basic-right.fr > ul > li:nth-child(6)", first=True)
    address_to_save = ' '.join(address.full_text.split())  # collapse runs of whitespace
    print("address:", address_to_save)
    # floor: selector noted but not extracted yet:
    # body > div.main-wrap > div.house-basic-info.clearfix > div.house-basic-right.fr > ul > li:nth-child(4) > span.house_basic_title_content_item2
    if not is_test:
        save_data.append((url, area.full_text, telephone.full_text, address_to_save))
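
# A hedged sketch (not in the original gist) of how the floor selector noted
# above could be wired in; get_floor is a hypothetical helper, never called
# below, and untested against the live page.
def get_floor(html):
    floor = html.find(
        "body > div.main-wrap > div.house-basic-info.clearfix > "
        "div.house-basic-right.fr > ul > li:nth-child(4) > "
        "span.house_basic_title_content_item2",
        first=True,
    )
    return ' '.join(floor.full_text.split()) if floor else ''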

async def get_target_page_data(session, t_url):
    global bad_urls
    print("t_url:", t_url)  # e.g. http://sh.58.com/shangpu/34391724062893x.shtml
    t_doc = await fetch(session, t_url)
    t_html = HTML(html=t_doc)
    try:
        get_data(t_url, t_html)
    except Exception:
        print("bad url:", t_url)  # collect failures so they can be retried later
        bad_urls.append(t_url)
doc = "" | |
html = "" | |
t_html = "" | |
all_target_urls = [] | |
bad_urls = [] | |

async def main_get_target_urls():
    '''Collect listing-page URLs from the district index pages.'''
    url_list = [
        # Huangpu
        "http://sh.58.com/huangpu/shangpucz/?area=1000_2000&sq=1&PGTID=0d306b35-0057-7134-a99d-27cf70aa6504&ClickID=6",  # page 1
        "http://sh.58.com/huangpu/shangpucz/pn2/?area=1000_2000&sq=1&PGTID=0d306b35-0057-7c3d-057d-33d6738862b6&ClickID=2",  # page 2
        # Jing'an
        "http://sh.58.com/jingan/shangpucz/?area=1000_2000&sq=1&PGTID=0d306b35-0057-7701-b196-1d9583e9c1ae&ClickID=2",  # page 1
        "http://sh.58.com/jingan/shangpucz/pn2/?area=1000_2000&sq=1&PGTID=0d306b35-0057-b5f9-6ebb-5140d63e954c&ClickID=2",  # page 2
        "http://sh.58.com/jingan/shangpucz/pn3/?area=1000_2000&sq=1&PGTID=0d306b35-0057-bfbc-e51b-f42963cd8e56&ClickID=2",  # page 3
        # Xuhui
        "http://sh.58.com/xuhui/shangpucz/?area=1000_2000&sq=1&PGTID=0d306b35-0057-bff2-6fc6-6030b511f494&ClickID=2",
        "http://sh.58.com/xuhui/shangpucz/pn2/?area=1000_2000&sq=1&PGTID=0d306b35-0057-9297-bf16-230b883dc749&ClickID=2",
        "http://sh.58.com/xuhui/shangpucz/pn3/?area=1000_2000&sq=1&PGTID=0d306b35-0057-9b8c-c748-8d19d69c83b8&ClickID=2",
    ]
    global doc, html, t_html, all_target_urls  # for debugging
    async with aiohttp.ClientSession() as session:
        for url in url_list[:]:
            # doc = await fetch(session, 'http://python.org')
            doc = await fetch(session, url)  # stored on each pass
            html = HTML(html=doc)
            target_urls = [link for link in html.links if ("sh.58.com/shangpu" in link or "jxjump.58.com/service" in link)]
            # first collect all the target_urls and save them
            print("url:{} len:{}".format(url, len(target_urls)))
            for t_url in target_urls[:]:
                all_target_urls.append(t_url)
                # get_target_page_data(session, t_url)

async def main(urls):
    '''Fetch each listing page and store its details.'''
    async with aiohttp.ClientSession() as session:
        for t_url in urls:
            await get_target_page_data(session, t_url)
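
# A sketch of a concurrent alternative to main(), not in the original gist.
# asyncio.gather fires all fetches at once; the unbounded concurrency is an
# assumption and may trip the site's rate limiting, so the sequential main()
# above remains the conservative default.
async def main_concurrent(urls):
    async with aiohttp.ClientSession() as session:
        await asyncio.gather(*(get_target_page_data(session, u) for u in urls))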

def end_and_clearup():
    # only needs to run once per session
    global all_target_urls, bad_urls
    with open("bad_urls_{}.py".format(timestamp), "w") as bad_urls_file:
        content = "bad_urls = " + str(bad_urls)
        bad_urls_file.write(content)

if __name__ == '__main__':
    import bad_urls_1528789277
    loop = asyncio.get_event_loop()
    urls = bad_urls_1528789277.bad_urls  # start here; each later run reuses the saved bad_urls
    loop.run_until_complete(main(urls))
    export_csv(save_data)
    end_and_clearup()  # run manually