Although I scraped this site's data, I don't believe the data is real.
#!/usr/bin/env python
# coding:utf-8
"""
This code makes me look like an idiot: the nested loop is O(n^2), which is not a good idea.
We could store the first file's records in a dict and make a single pass over the second
file, cutting the work by roughly (m - n); binary search would give O(n log n), but in Python
a dict lookup is simply O(1).
The records look like: [{}, {}, {}]
I will optimize this later; for now it just shows how to use a pandas DataFrame to convert
JSON lines to xlsx.
"""
import json

from pandas import ExcelWriter, DataFrame

xlsx_writer = ExcelWriter('./doctor_henan.xlsx')


def store(info, filename):
    # append the raw line to an error log so bad records can be inspected later
    with open(filename, 'a') as dest:
        dest.write(info + '\n')


info = []
with open("./hospital_info_henan") as hhenan:
    for hospital in hhenan.readlines():
        hospitals = json.loads(hospital)
        for faculty in hospitals['all_faculty_info']:
            # note: the doctor file is re-read for every faculty, hence the O(n^2) complaint above
            with open("./doctor_info_henan") as dhenan:
                for doctor in dhenan.readlines():
                    try:
                        doctors = json.loads(doctor)
                        # 'docotor_info' is kept misspelled: that is the key the spider wrote
                        if doctors['faculty_id'] == faculty['special_id'] and doctors['docotor_info'] is not None:
                            for person in doctors['docotor_info']:
                                opt = {
                                    u'医生姓名': person['name'],
                                    u"职称": person['grade'],
                                    u"等级": person['educateGrade'],
                                    u"所属科室": faculty['faculty_name'],
                                    u"所属医院": hospitals['hospital_name'],
                                }
                                info.append(opt)
                    except Exception:
                        store(doctor, "./error_data")

test_df = DataFrame(info)
test_df.to_excel(xlsx_writer, 'Sheet1')
xlsx_writer.save()
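As the docstring suggests, indexing the doctor file by faculty_id first turns the nested scan into a single pass with O(1) lookups. A minimal sketch of that idea, assuming the same file names, keys, and pandas version as above (not the author's final version):

#!/usr/bin/env python
# coding:utf-8
# Sketch: group doctors by faculty_id once, then join against the hospital file in one pass.
import json
from collections import defaultdict
from pandas import DataFrame, ExcelWriter

# one pass over the doctor file: bucket doctor records by faculty_id
doctors_by_faculty = defaultdict(list)
with open("./doctor_info_henan") as dfile:
    for line in dfile:
        rec = json.loads(line)
        if rec.get('docotor_info'):
            doctors_by_faculty[rec['faculty_id']].extend(rec['docotor_info'])

# one pass over the hospital file: O(1) dict lookup per faculty instead of re-reading the doctor file
rows = []
with open("./hospital_info_henan") as hfile:
    for line in hfile:
        hospital = json.loads(line)
        for faculty in hospital['all_faculty_info']:
            for person in doctors_by_faculty.get(faculty['special_id'], []):
                rows.append({
                    u'医生姓名': person['name'],
                    u"职称": person['grade'],
                    u"等级": person['educateGrade'],
                    u"所属科室": faculty['faculty_name'],
                    u"所属医院": hospital['hospital_name'],
                })

writer = ExcelWriter('./doctor_henan.xlsx')
DataFrame(rows).to_excel(writer, 'Sheet1')
writer.save()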
#!/usr/bin/env python
# coding:utf-8
import json

doctor_touch_urlList = []
hospital_urllist = []


def get_doctor_count(doctors_info_file):
    # count doctors as stored, then dedupe by touchUrl to see how many are unique
    with open(doctors_info_file) as d:
        origin_count = 0
        for _ in d.readlines():
            test = json.loads(_)
            if test['docotor_info'] is not None:
                origin_count = origin_count + len(test['docotor_info'])
                for s_url in test['docotor_info']:
                    doctor_touch_urlList.append(s_url['touchUrl'])
        doctors = list(set(doctor_touch_urlList))
        return origin_count, len(doctors)


def get_hospital_count(hospitals_info_file):
    # count hospital records, then dedupe by homepage URL
    with open(hospitals_info_file) as h:
        origin_count = 0
        for _ in h.readlines():
            origin_count = origin_count + 1
            test = json.loads(_)
            hospital_urllist.append(test['hospital_homepage'])
            # you may need to change the key from "hospital_homepage" to "hospital_url";
            # some records also contain spelling mistakes (see the tolerant lookup sketched below)
        hospitals = list(set(hospital_urllist))
        return origin_count, len(hospitals)


o_h, n_h = get_hospital_count('hosptial_info_jilin')
o_d, n_d = get_doctor_count('doctor_info_jilin')
print "origin_hospital: " + str(o_h) + " unique_hospital: " + str(n_h) + '\n' + "origin_doctor: " + str(o_d) + " unique_doctor: " + str(n_d)
#!/usr/bin/env python
# coding=utf-8
import requests, json
import time, random
import sys

reload(sys)
sys.setdefaultencoding('utf-8')

headers = {
    'host': "m.haodf.com",
    'accept': "*/*",
    'x-requested-with': "XMLHttpRequest",
    'user-agent': "Mozilla/5.0 (iPhone; CPU iPhone OS 9_1 like Mac OS X) AppleWebKit/601.1.46 (KHTML, like Gecko) Version/9.0 Mobile/13B143 Safari/601.1",
    'referer': "http://m.haodf.com/touch/faculty/DE4roiYGYZw0GIaCFVHDuJVht.htm",
    'accept-encoding': "gzip, deflate, sdch",
    'accept-language': "zh-CN,zh;q=0.8",
    'cookie': "NAVTOUCH=1; g=HDF.112.58abebaaf0985; Hm_lvt_dfa5478034171cc641b1639b2a5b717d=1487681206; _ga=GA1.2.2089374926.1487681207; newaskindex=1",
    'connection': "close",
    'cache-control': "no-cache"
}


def store(info, filename):
    with open(filename, 'a') as dest:
        dest.write(json.dumps(info, ensure_ascii=False).encode('utf8') + '\n')


# My approach here was too narrow: I should simply check whether `contents` is empty,
# instead of worrying about whether the flag field is empty.
def get_doctor_info(faculty_id, p=0):
    info = {}
    info[u'docotor_info'] = []
    info[u'faculty_id'] = faculty_id
    url = "http://m.haodf.com/touch/faculty/loaddoctors/" + faculty_id
    querystring = {"caseorphone": "0", "booking": "0", "p": p}
    response = requests.request("GET", url, headers=headers, params=querystring)
    docotor_info = json.loads(response.text)
    if docotor_info[u'contents'] is not None:
        for _ in docotor_info[u'contents']:
            info[u'docotor_info'].append(_)
        print info
        return info
    else:
        return None


def get_factor_doctor_info(faculty_id, p=0):
    # fetch page p once, then tack on page p+1 if it exists;
    # a full pagination loop is sketched after this file
    test = get_doctor_info(faculty_id, p)
    if test is None:
        return
    next_page = get_doctor_info(faculty_id, p + 1)
    if next_page is not None:
        test['docotor_info'].extend(next_page['docotor_info'])
    return store(test, 'doctor_test')
    # return store(test, faculty_id)


with open('./faculty_id') as f:
    for _ in f.readlines():
        f_id = _.strip('\r\n')
        try:
            get_factor_doctor_info(f_id)
            time.sleep(random.random())  # sleep 0-1 seconds between faculties
        except Exception as e:
            time.sleep(3 * 60)  # back off for three minutes, then retry once
            get_factor_doctor_info(f_id)
# get_doctor_info("DE4r0eJWGqZNZYiaYhI2QCo55twNygxV")
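As the comment in the file suggests, a cleaner approach is to keep requesting pages until `contents` comes back empty, rather than special-casing one extra page. A rough sketch reusing get_doctor_info and store from above (the function name and the stopping rule are assumptions, not the site's documented behaviour):

def get_all_doctor_info(faculty_id):
    # keep paging until a page returns no doctors; each page is fetched exactly once
    merged = {u'faculty_id': faculty_id, u'docotor_info': []}
    p = 0
    while True:
        page = get_doctor_info(faculty_id, p)
        if page is None or not page['docotor_info']:
            break
        merged['docotor_info'].extend(page['docotor_info'])
        p += 1
        time.sleep(random.random())  # be gentle between page requests
    if merged['docotor_info']:
        store(merged, 'doctor_test')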
var webPage = require('webpage');
var fs = require('fs');

var puredocid = fs.open('./testpuredocid', 'r');
var homepage = fs.open('./doc_homepage_url', '+');
var prefix = 'http://www.haodf.com/doctor/';
var idlist = [];
var urllist = [];

// build the full doctor-page URL for every id in ./testpuredocid
while (!puredocid.atEnd()) {
    var id = puredocid.readLine();
    var url = prefix + id + '.htm';
    urllist.push(url);
    idlist.push(id);
}
puredocid.close();

function process() {
    if (urllist.length == 0) {
        homepage.close();   // close the output stream so buffered lines reach disk before exit
        phantom.exit();
    } else {
        url = urllist.pop();
        page = webPage.create();
        page.open(url, onFinishedLoading);
    }
}

function onFinishedLoading(status) {
    // pull the personal-homepage link out of the doctor's "about" block
    var currentUrl = page.evaluate(function () {
        return [].map.call(document.querySelectorAll('#bp_doctor_about > div > div.middletr > div > div.doctor-home-page.clearfix > span:nth-child(3) > a'), function (link) {
            return link.getAttribute('href');
        });
    });
    if (currentUrl != "") {
        console.log(currentUrl);
        homepage.writeLine(currentUrl);
        homepage.flush();   // without flushing/closing the stream nothing showed up in the file, which is likely the "doesn't write" problem noted earlier
    }
    page.release();
    process();
}

process();
// this script takes a long time: pages are fetched one at a time
#!/usr/bin/env python
# coding: utf-8
"""haodaifu spider.

Usage:
    haodf_spider.py
    haodf_spider.py (-h | --help)
    haodf_spider.py --version
"""
# the usage block above exists only so docopt(__doc__) has something to parse;
# the script takes no arguments and the script name in it is a placeholder
import time, random
import requests, json
from urlparse import urlparse
from bs4 import BeautifulSoup
from docopt import docopt
import lxml


def get_hospital_office(hospital_office):
    # each element's text is "<hospital> <office>", split on the space
    hospital = []
    office = []
    for _ in hospital_office:
        h = _.text.split(" ")[0]
        o = _.text.split(" ")[1]
        hospital.append(h)
        office.append(o)
    return hospital, office


def parse_artical_info(artical_info):
    a_a = {}
    o_o = []
    for _ in artical_info:
        try:
            fl = _.text.split().index("人已读")  # marker token: "people have read"
            a_a['artical_catalog'] = _.text.split()[0].strip("[]")
            a_a['artical_title'] = _.text.split()[1]
            a_a['read'] = _.text.split()[fl - 1]
            artical_time = _.text.split()[fl + 1]
            a_a['artical_time'] = artical_time.strip("发表于")  # strip the "published on" prefix
            a_a['origin_info'] = _.text.split()  # keep the raw article tokens as a backup for later processing
            o_o.append(a_a.copy())
        except Exception as e:
            pass
    return o_o


def get_artical_info_common(homepage, uname, page_count=1):
    lanmu_url = "http://" + homepage + "/lanmu"
    lanmu = requests.get(lanmu_url + "_" + str(page_count), headers=header)
    artical_soup = BeautifulSoup(lanmu.text, "lxml")
    check = "http://" + homepage + "/api/article/ajaxcategorylist?uname=" + uname
    flag = requests.get(check, headers=header)
    if len(flag.text) > 47:
        artical_info = artical_soup.select("ul.article_ul > li > div.clearfix")
        a_Oinfo = parse_artical_info(artical_info)
        try:
            all_page_count = artical_soup.select("a.page_turn_a")[0].text.strip()
            if all_page_count and int(all_page_count) >= page_count:
                for page_number in range(2, int(all_page_count) + 1):
                    page_html = requests.get(lanmu_url + "_" + str(page_number), headers=header)
                    articalsoup = BeautifulSoup(page_html.text, "lxml")
                    artical_info = articalsoup.select("ul.article_ul > li > div.clearfix")
                    a_Oinfo.extend(parse_artical_info(artical_info))  # extend keeps the result a flat list of dicts
        except Exception as e:
            pass
        return a_Oinfo
    else:
        return


def get_doctor_info(doctor_info, hospital_office):
    doctor_name = []
    doctor_hompage = []
    doctor_artical_url = []
    doctor_skillful = []
    doctor_experience = []
    artical_info = []
    hospital, office = get_hospital_office(hospital_office)
    for _ in doctor_info:
        url = urlparse(_['href'])
        homepage = url.netloc
        artical_url = homepage + "/lanmu"
        uname = homepage.split(".")[0]
        querystring = {"uname": uname}
        detail_url = "http://" + homepage + "/api/index/ajaxdoctorintro"
        details = requests.get(detail_url, headers=header, params=querystring)
        dsoup = BeautifulSoup(details.text, 'lxml')
        skillful = dsoup.select(".pb15.bbd_e9 > p.hh")
        experience = dsoup.select(".pt15 > p.hh")
        doctor_name.append(_.text.split())
        doctor_hompage.append(homepage.split())
        doctor_artical_url.append(artical_url.split())
        doctor_skillful.append(skillful[0].text.split())
        doctor_experience.append(experience[0].text.split())
        t = get_artical_info_common(homepage, uname)
        artical_info.append(t)
    allinfo = zip(doctor_name, hospital, office, doctor_hompage, doctor_artical_url, doctor_skillful, doctor_experience, artical_info)
    store(allinfo, 'ZheShiJiaDeHaodaifu')


def store(info, filename):
    with open(filename, 'a') as dest:
        dest.write(json.dumps(info, ensure_ascii=False).encode('utf8') + '\n')


# if something goes wrong, use this to run a quick single-page test
def simple_test(testurl):
    print testurl
    r = requests.get(testurl, headers=header)
    mainsoup = BeautifulSoup(r.text, "lxml")
    doctor_info = mainsoup.select(".fl > .mt5 > a")
    hospital_office = mainsoup.select(".fr > .fb")
    get_doctor_info(doctor_info, hospital_office)


def spider_worker(urllist):
    for url in urllist:
        try:
            time.sleep(int(random.uniform(3, 7)))
            r = requests.get(url, headers=header)
        except Exception as e:
            time.sleep(60 * 15)  # you could also rotate proxies with a generator or a middleware; see the sketch after this file
            r = requests.get(url, headers=header)
        mainsoup = BeautifulSoup(r.text, "lxml")
        doctor_info = mainsoup.select(".fl > .mt5 > a")
        hospital_office = mainsoup.select(".fr > .fb")
        try:
            get_doctor_info(doctor_info, hospital_office)
        except Exception as e:
            time.sleep(60 * 15)
            get_doctor_info(doctor_info, hospital_office)


if __name__ == '__main__':
    import sys
    reload(sys)
    sys.setdefaultencoding('utf-8')
    arguments = docopt(__doc__, version='haodaifu spider 0.1')
    print arguments
    header = {
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
        "Accept-Encoding": "gzip, deflate, sdch, br",
        "Accept-Language": "zh-CN,zh;q=0.8",
        "Connection": "keep-alive",
        "Cookie": "g=HDF.96.58a65f5833eab",
        "If-Modified-Since": "Fri, 17 Feb 2017 04:23:52 GMT",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36"
    }
    # the search result list pages run from 1 to 3910
    start_url = 'http://400.haodf.com/index/search?diseasename=&province=&facultyname=&hosfaculty=&hospitalname=&nowpage='
    # test_url = 'http://400.haodf.com/index/search?diseasename=&province=&facultyname=&hosfaculty=&hospitalname=&nowpage=9'
    urllist = [start_url + str(i) for i in range(1, 3911)]
    spider_worker(urllist)
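The except branch in spider_worker hints at rotating proxies instead of sleeping for fifteen minutes. A minimal sketch of that idea using a cycling generator; the proxy addresses and the helper name fetch_with_proxy are placeholders, not real endpoints or part of the original spider:

import itertools
import requests

# placeholder proxy list; replace with real proxies before use
PROXIES = [
    {"http": "http://127.0.0.1:8001"},
    {"http": "http://127.0.0.1:8002"},
]
proxy_pool = itertools.cycle(PROXIES)

def fetch_with_proxy(url, headers, retries=3):
    # route each attempt through the next proxy in the cycle; fall back to a direct request
    for _ in range(retries):
        try:
            return requests.get(url, headers=headers, proxies=next(proxy_pool), timeout=10)
        except requests.RequestException:
            continue
    return requests.get(url, headers=headers, timeout=10)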
#!/usr/bin/env python
# coding=utf-8
import requests, json
import time
from urlparse import urlparse
from bs4 import BeautifulSoup
import lxml
# At first I planned to keep the area URL list as a global variable and return it from make_url,
# but that looked ugly, so I extracted the URL list and saved it to a separate txt file.
import sys
reload(sys)
sys.setdefaultencoding('utf-8')

headers = {
    'host': "www.haodf.com",
    'connection': "keep-alive",
    'cache-control': "no-cache",
    'upgrade-insecure-requests': "1",
    'user-agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36",
    'accept': "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
    'accept-encoding': "gzip, deflate, sdch",
    'accept-language': "zh-CN,zh;q=0.8",
    'cookie': "g=40982_1487298388760; __ads_session=fS8tLPu23Qh88vQLKgA=; NAVTOUCH=1; g=HDF.112.58abde6b2b204"
}


# use hospital_special_id to build the homepage, intro and faculty URLs
def make_url(start_url):
    hospital_url_list = []
    hospital_name_list = []
    hospital_intro_url_list = []
    hospital_special_id_list = []
    test_hospital = requests.get(start_url, headers=headers)
    h_soup = BeautifulSoup(test_hospital.text, 'lxml')
    start = h_soup.select('.m_ctt_green a')
    for _ in start:
        try:
            special_hospital_id = _['href'].split("/")[2].strip(".htm")
            hospital_url = "http://www.haodf.com/hospital" + _['href']  # just store the homepage
            hospital_intro_url = "http://info.haodf.com/hospital/" + special_hospital_id + "/jieshao.htm"
        except (KeyError, IndexError):
            continue  # skip links without a usable href so the four lists stay aligned
        hospital_name_list.append(_.text)
        hospital_url_list.append(hospital_url)
        hospital_intro_url_list.append(hospital_intro_url)
        hospital_special_id_list.append(special_hospital_id)
    return hospital_name_list, hospital_url_list, hospital_intro_url_list, hospital_special_id_list


# return the hospital intro; the faculty info is fetched separately
def get_hospital_info(hospital_url):
    hinfo = requests.get(hospital_url, headers=headers)
    info_soup = BeautifulSoup(hinfo.text, 'lxml')
    try:
        res = info_soup.select('.czsj td')[0].text
    except IndexError:
        res = ""
    return res


# get a single faculty's introduction text
def get_faculty_info(faculty_special_id):
    faculty_url = "http://www.haodf.com/faculty/" + faculty_special_id + "/jieshao.htm"
    hospital_faculty = requests.get(faculty_url, headers=headers)
    faculty_soup = BeautifulSoup(hospital_faculty.text, 'lxml')
    try:
        faculty_detail = faculty_soup.select('#about_det')[0].text
    except IndexError:
        faculty_detail = ""
    return faculty_detail


# get all of a hospital's faculty URLs, e.g. http://www.haodf.com/hospital/DE4raCNSz6OmG3OUNZWCWNv0/keshi.htm
def get_hospital_faculty(hospital_special_id):
    url = "http://www.haodf.com/hospital/" + hospital_special_id + "/keshi.htm"
    hospital_faculty_all = []
    faculty_info = {}
    tmp = requests.get(url, headers=headers)
    t_soup = BeautifulSoup(tmp.text, 'lxml')
    faculty_list = t_soup.select('.bluepanel .blue')
    for _ in faculty_list:
        faculty_info['faculty_name'] = _.text
        faculty_info['url'] = _['href']
        faculty_info['special_id'] = urlparse(_['href']).path.split('/')[2].split('.')[0]
        hospital_faculty_all.append(faculty_info.copy())
    return hospital_faculty_all


def one_hospital(hospital_name_list, hospital_url_list, hospital_intro_url_list, hospital_special_id_list):
    info = {}
    for _ in zip(hospital_name_list, hospital_url_list, hospital_intro_url_list, hospital_special_id_list):
        info['hospital_name'] = _[0]
        info['hospital_homepage'] = _[1]
        try:
            info['hospital_detial'] = get_hospital_info(_[2])  # key kept misspelled for compatibility with data already on disk
            faculty_info = get_hospital_faculty(_[3])
            all_faculty_info = []
        except Exception as e:
            time.sleep(3 * 60)  # back off for three minutes, then retry once
            info['hospital_detial'] = get_hospital_info(_[2])
            faculty_info = get_hospital_faculty(_[3])
            all_faculty_info = []
        for faculty in faculty_info:
            try:
                single_info = get_faculty_info(faculty['special_id'])
                faculty['faculty_info'] = single_info
                all_faculty_info.append(faculty)
            except Exception as e:
                time.sleep(3 * 60)
                single_info = get_faculty_info(faculty['special_id'])
                faculty['faculty_info'] = single_info
                all_faculty_info.append(faculty)
        info['all_faculty_info'] = all_faculty_info
        store(info, "./Info")


def store(info, filename):
    with open(filename, 'a') as dest:
        dest.write(json.dumps(info, ensure_ascii=False).encode('utf8') + '\n')


# done, without try/except
def test_get_hospital_faculty():
    test = get_hospital_faculty("DE4raCNSz6OmG3OUNZWCWNv0")  # pass the special_id, not the full keshi.htm URL
    for _ in test:
        print _
    return test


# done; faculty_special_id comes from get_hospital_faculty
def test_get_faculty_info():
    test = get_faculty_info("DE4r0BCkuHzduSnKssvRDydDzVOlL")
    print test


# done with hospital_info
def test_get_hospital_info():
    test = get_hospital_info("http://www.haodf.com/faculty/DE4roiYGYZw0JOrEpjdCy8jrf/jieshao.htm")
    print test


# done; this can fetch all of one hospital's info, but there is a small cross-platform issue (see the \r note at the end)
def test_one_hospital(url):
    a, s, d, f = make_url(url)
    one_hospital(a, s, d, f)


# area_url.txt was produced by extracting the area list from the homepage (a small tweak to make_url is enough)
with open('./area_url.txt') as area_list:
    for area in area_list.readlines():
        print "Now processing: " + area.strip('\r\n')
        test_one_hospital(area.strip('\r\n'))
# I never expected the stray \r to be the culprit
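The closing remark about \r comes from reading a file with Windows-style line endings: strip('\n') leaves a trailing carriage return on the URL, which then breaks the request. A two-line illustration (the URL below is just an example value):

line = "http://www.haodf.com/example/list.htm\r\n"  # a line as read from a CRLF-encoded area_url.txt
print repr(line.strip('\n'))    # the stray \r survives at the end of the URL
print repr(line.strip('\r\n'))  # clean URL with both \r and \n removed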