import aiohttp
import asyncio
from requests_html import HTML
import tablib
import time
timestamp = int(time.time())
# model
headers = ('url', 'area', 'telephone', 'address')  # column names
data = []
save_data = tablib.Dataset(*data, headers=headers)

def export_csv(data):
    # write the collected rows to a timestamped CSV file
    with open('test_{}.csv'.format(timestamp), 'wb') as f:
        f.write(data.export('csv').encode("utf-8"))

async def fetch(session, url):
    async with session.get(url) as response:
        return await response.text()

def get_data(t_url, html, is_test=False):
    global save_data
    url = t_url
    area = html.find('body > div.main-wrap > div.house-basic-info.clearfix > div.house-basic-right.fr > ul > li:nth-child(1) > span.house_basic_title_content_item2', first=True)
    print("area:", area.full_text)
    telephone = html.find('#houseChatEntry > div > p.phone-num', first=True)
    print("telephone:", telephone.full_text)
    # address
    address = html.find("body > div.main-wrap > div.house-basic-info.clearfix > div.house-basic-right.fr > ul > li:nth-child(6)", first=True)
    address_to_save = ' '.join(address.full_text.split())  # collapse runs of whitespace
    print("address:", address_to_save)
    # floor (selector kept for reference, not extracted):
    # body > div.main-wrap > div.house-basic-info.clearfix > div.house-basic-right.fr > ul > li:nth-child(4) > span.house_basic_title_content_item2
    if not is_test:
        save_data.append((url, area.full_text, telephone.full_text, address_to_save))

async def get_target_page_data(session, t_url):
    global bad_urls
    print("t_url:", t_url)  # e.g. http://sh.58.com/shangpu/34391724062893x.shtml
    t_doc = await fetch(session, t_url)
    t_html = HTML(html=t_doc)
    try:
        get_data(t_url, t_html)
    except Exception:
        print("bad url:", t_url)  # collect the failure so it can be retried later
        bad_urls.append(t_url)
doc = ""
html = ""
t_html = ""
all_target_urls = []
bad_urls = []

async def main_get_target_urls():
    '''collect the shop detail-page URLs from each district's listing pages'''
    url_list = [
        # Huangpu
        "http://sh.58.com/huangpu/shangpucz/?area=1000_2000&sq=1&PGTID=0d306b35-0057-7134-a99d-27cf70aa6504&ClickID=6",  # page 1
        "http://sh.58.com/huangpu/shangpucz/pn2/?area=1000_2000&sq=1&PGTID=0d306b35-0057-7c3d-057d-33d6738862b6&ClickID=2",  # page 2
        # Jing'an
        "http://sh.58.com/jingan/shangpucz/?area=1000_2000&sq=1&PGTID=0d306b35-0057-7701-b196-1d9583e9c1ae&ClickID=2",  # page 1
        "http://sh.58.com/jingan/shangpucz/pn2/?area=1000_2000&sq=1&PGTID=0d306b35-0057-b5f9-6ebb-5140d63e954c&ClickID=2",  # page 2
        "http://sh.58.com/jingan/shangpucz/pn3/?area=1000_2000&sq=1&PGTID=0d306b35-0057-bfbc-e51b-f42963cd8e56&ClickID=2",  # page 3
        # Xuhui
        "http://sh.58.com/xuhui/shangpucz/?area=1000_2000&sq=1&PGTID=0d306b35-0057-bff2-6fc6-6030b511f494&ClickID=2",
        "http://sh.58.com/xuhui/shangpucz/pn2/?area=1000_2000&sq=1&PGTID=0d306b35-0057-9297-bf16-230b883dc749&ClickID=2",
        "http://sh.58.com/xuhui/shangpucz/pn3/?area=1000_2000&sq=1&PGTID=0d306b35-0057-9b8c-c748-8d19d69c83b8&ClickID=2",
    ]
    global doc, html, t_html, all_target_urls  # globals used for debugging
    async with aiohttp.ClientSession() as session:
        for url in url_list[:]:
            # doc = await fetch(session, 'http://python.org')  # stored on each iteration
            doc = await fetch(session, url)
            html = HTML(html=doc)
            target_urls = [link for link in html.links
                           if ("sh.58.com/shangpu" in link or "jxjump.58.com/service" in link)]
            # first collect all target_urls and save them
            print("url:{} len:{}".format(url, len(target_urls)))
            for t_url in target_urls[:]:
                all_target_urls.append(t_url)
                # get_target_page_data(session, t_url)

async def main(urls):
    '''fetch each shop detail page and store its info'''
    async with aiohttp.ClientSession() as session:
        for t_url in urls:
            await get_target_page_data(session, t_url)
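
# --- not part of the original gist: a minimal concurrent variant, sketch only ---
# The original main() awaits each page one at a time, which is simple but slow for
# many URLs. Assuming the same fetch/get_target_page_data helpers above, the pages
# could also be fetched concurrently; the semaphore limit of 5 is an arbitrary
# illustration value, not something the gist specifies.
async def main_concurrent(urls, limit=5):
    '''like main(), but fetches up to `limit` pages at once'''
    sem = asyncio.Semaphore(limit)
    async with aiohttp.ClientSession() as session:
        async def bounded(t_url):
            async with sem:
                await get_target_page_data(session, t_url)
        await asyncio.gather(*(bounded(t_url) for t_url in urls))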

def end_and_clearup():
    # only needs to run once: dump the failed URLs to an importable module for a retry run
    global all_target_urls, bad_urls
    with open("bad_urls_{}.py".format(timestamp), "w") as bad_urls_file:
        content = "bad_urls = " + str(bad_urls)
        bad_urls_file.write(content)

if __name__ == '__main__':
    import bad_urls_1528789277
    loop = asyncio.get_event_loop()
    urls = bad_urls_1528789277.bad_urls  # start from the saved failures; reuse bad_urls on later runs
    loop.run_until_complete(main(urls))
    export_csv(save_data)
    end_and_clearup()  # done by hand
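
# --- usage note (not in the original gist) ---
# The __main__ block above only retries URLs that an earlier run saved into
# bad_urls_1528789277.py. A first, full pass would roughly look like this:
#
# loop = asyncio.get_event_loop()
# loop.run_until_complete(main_get_target_urls())  # fills all_target_urls from the listing pages
# loop.run_until_complete(main(all_target_urls))   # fills save_data and bad_urls
# export_csv(save_data)                            # writes test_<timestamp>.csv
# end_and_clearup()                                # writes bad_urls_<timestamp>.py for a retry run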