@mylamour
Last active March 6, 2017 08:46
Although I scraped this site's data, I don't believe the data on this site is genuine.
#!/usr/bin/env python
# coding:utf-8
"""
This code makes me look like an idiot: the O(n^2) nested scan is not a good idea.
We can store the first file's records in a dict and just make a single pass over the
second file, which clearly cuts the work down by roughly (m - n); binary search would
give O(n log n), but in Python a dict lookup is O(1).
The info structure looks like: [{}, {}, {}]
I will optimize this code later; for now it just shows how to use a pandas DataFrame
to convert JSON to xlsx.
"""
import json
from pandas import ExcelWriter, DataFrame

xlsx_writer = ExcelWriter('./doctor_henan.xlsx')

def store(info, filename):
    with open(filename, 'a') as dest:
        dest.write(info + '\n')

info = []
with open("./hospital_info_henan") as hjilin:  # open("./doctor_info_jilin") as djilin:
    for hosptial in hjilin.readlines():
        hosptials = json.loads(hosptial)
        for _ in hosptials['all_faculty_info']:
            with open("./doctor_info_henan") as djilin:
                for doctor in djilin.readlines():
                    try:
                        doctors = json.loads(doctor)
                        if doctors['faculty_id'] == _['special_id'] and doctors['docotor_info'] is not None:
                            for person in doctors['docotor_info']:
                                opt = {
                                    u'医生姓名': person['name'],
                                    u"职称": person['grade'],
                                    u"等级": person['educateGrade'],
                                    u"所属科室": _['faculty_name'],
                                    u"所属医院": hosptials['hospital_name'],
                                }
                                info.append(opt)
                    except Exception as e:
                        store(doctor, "./error_data")

test_df = DataFrame(info)
test_df.to_excel(xlsx_writer, 'Sheet1')
xlsx_writer.save()
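As the docstring above notes, the nested file scan can be replaced by indexing the doctor records in a dict keyed by faculty_id and making a single pass over the hospital file. A minimal sketch of that idea, reusing the same file names and JSON keys as the script (the dict name doctors_by_faculty is mine, and this is a sketch of the flow rather than a drop-in replacement):

    # Sketch: build a faculty_id -> doctor list index once, then look it up in O(1)
    # while walking the hospital file. File names and keys follow the script above.
    import json

    doctors_by_faculty = {}
    with open("./doctor_info_henan") as df:
        for line in df:
            record = json.loads(line)
            if record.get('docotor_info'):
                doctors_by_faculty.setdefault(record['faculty_id'], []).extend(record['docotor_info'])

    info = []
    with open("./hospital_info_henan") as hf:
        for line in hf:
            hospital = json.loads(line)
            for faculty in hospital['all_faculty_info']:
                # O(1) dict lookup instead of re-reading the doctor file every time
                for person in doctors_by_faculty.get(faculty['special_id'], []):
                    info.append({
                        u'医生姓名': person['name'],
                        u"职称": person['grade'],
                        u"等级": person['educateGrade'],
                        u"所属科室": faculty['faculty_name'],
                        u"所属医院": hospital['hospital_name'],
                    })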
#!/usr/bin/env python
# coding:utf-8
import json
from urlparse import urlparse

doctor_touch_urlList = []
hospital_urllist = []

def get_doctor_count(doctors_info_file):
    with open(doctors_info_file) as d:
        origin_count = 0
        for _ in d.readlines():
            test = json.loads(_)
            # check for None before counting, otherwise len(None) raises TypeError
            if test['docotor_info'] is not None:
                origin_count = origin_count + len(test['docotor_info'])
                for s_url in test['docotor_info']:
                    doctor_touch_urlList.append(s_url['touchUrl'])
        docotors = list(set(doctor_touch_urlList))
        return origin_count, len(docotors)

def get_hospital_count(hosptials_info_file):
    with open(hosptials_info_file) as h:
        origin_count = 0
        for _ in h.readlines():
            origin_count = origin_count + 1
            test = json.loads(_)
            hospital_urllist.append(test['hospital_homepage'])
            # You may need to change the key "hospital_homepage" to "hospital_url";
            # there were also some spelling mistakes in the data.
        hospitals = list(set(hospital_urllist))
        return origin_count, len(hospitals)

o_h, n_h = get_hospital_count('hosptial_info_jilin')
o_d, n_d = get_doctor_count('doctor_info_jilin')
print "origin_hospital: " + str(o_h) + " unique_hospital: " + str(n_h) + '\n' + "origin_doctor: " + str(o_d) + " unique_doctor: " + str(n_d)
#!/usr/bin/env python
# coding=utf-8
import requests, json
import time, random
import sys
reload(sys)
sys.setdefaultencoding('utf-8')

headers = {
    'host': "m.haodf.com",
    'accept': "*/*",
    'x-requested-with': "XMLHttpRequest",
    'user-agent': "Mozilla/5.0 (iPhone; CPU iPhone OS 9_1 like Mac OS X) AppleWebKit/601.1.46 (KHTML, like Gecko) Version/9.0 Mobile/13B143 Safari/601.1",
    'referer': "http://m.haodf.com/touch/faculty/DE4roiYGYZw0GIaCFVHDuJVht.htm",
    'accept-encoding': "gzip, deflate, sdch",
    'accept-language': "zh-CN,zh;q=0.8",
    'cookie': "NAVTOUCH=1; g=HDF.112.58abebaaf0985; Hm_lvt_dfa5478034171cc641b1639b2a5b717d=1487681206; _ga=GA1.2.2089374926.1487681207; newaskindex=1",
    'connection': "close",
    'cache-control': "no-cache"
}

def store(info, filename):
    with open(filename, 'a') as dest:
        dest.write(json.dumps(info, ensure_ascii=False).encode('utf8') + '\n')

# My thinking was too narrow here: I should simply check whether `contents` is empty,
# instead of checking whether the flag field is None.
def get_doctor_info(faculty_id, p=0):
    info = {}
    contents = []
    info[u'docotor_info'] = contents
    info[u'faculty_id'] = faculty_id
    url = "http://m.haodf.com/touch/faculty/loaddoctors/" + faculty_id
    querystring = {"caseorphone": "0", "booking": "0", "p": p}
    response = requests.request("GET", url, headers=headers, params=querystring)
    docotor_info = json.loads(response.text)
    if docotor_info[u'contents'] is not None:
        for _ in docotor_info[u'contents']:
            info[u'docotor_info'].append(_)
        print info
        return info
    else:
        return None

def get_factor_doctor_info(faculty_id, p=0):
    test = get_doctor_info(faculty_id, p)
    if test is None:
        return
    else:
        map(lambda x: test['docotor_info'].append(x), get_doctor_info(faculty_id, p + 1)['docotor_info'])
        return store(test, 'doctor_test')
        # return store(test, faculty_id)

with open('./faculty_id') as f:
    for _ in f.readlines():
        f_id = _.strip('\r\n')
        try:
            get_factor_doctor_info(f_id)
            time.sleep(random.random())  # 0-1 sec
        except Exception as e:
            time.sleep(3 * 60)
            get_factor_doctor_info(f_id)
# get_doctor_info("DE4r0eJWGqZNZYiaYhI2QCo55twNygxV")
var webPage = require('webpage');
var fs = require('fs');

var puredocid = fs.open('./testpuredocid', 'r');
var homepage = fs.open('./doc_homepage_url', '+');
var preffix = 'http://www.haodf.com/doctor/';
var idlist = [];
var urllist = [];

while (!puredocid.atEnd()) {
    var id = puredocid.readLine();
    var url = preffix + id + '.htm';
    urllist.push(url);
    idlist.push(id);
}
puredocid.close();

function process() {
    if (urllist.length == 0) {
        // close (and thereby flush) the output stream before exiting,
        // otherwise buffered lines may never reach the file
        homepage.close();
        phantom.exit();
    } else {
        url = urllist.pop();
        page = require('webpage').create();
        page.open(url, onFinishedLoading);
    }
}

function onFinishedLoading(status) {
    var currentUrl = page.evaluate(function() {
        return [].map.call(document.querySelectorAll('#bp_doctor_about > div > div.middletr > div > div.doctor-home-page.clearfix > span:nth-child(3) > a'), function(link) {
            return link.getAttribute('href');
        });
    });
    if (currentUrl != "") {
        console.log(currentUrl);
        homepage.writeLine(currentUrl); // these lines were not reaching the file; closing the stream before exit (above) should address that
    }
    page.release();
    process();
}

process();
// This script takes a lot of time to run, which doesn't feel great.
#!/usr/bin/env python
# coding: utf-8
import requests, json
from urlparse import urlparse
from bs4 import BeautifulSoup
from docopt import docopt
import lxml

def get_hospital_office(hospital_office):
    hospital = []
    office = []
    for _ in hospital_office:
        h = _.text.split(" ")[0]
        o = _.text.split(" ")[1]
        hospital.append(h)
        office.append(o)
    return hospital, office

def parse_artical_info(artical_info):
    a_a = {}
    o_o = []
    for _ in artical_info:
        try:
            fl = _.text.split().index("人已读")
            a_a['artical_catalog'] = _.text.split()[0].strip("[]")
            a_a['artical_title'] = _.text.split()[1]
            a_a['read'] = _.text.split()[fl - 1]
            artical_time = _.text.split()[fl + 1]
            a_a['artical_time'] = artical_time.strip("发表于")
            a_a['origin_info'] = _.text.split()  # backup of the raw article data, process it as you like
            o_o.append(a_a.copy())
        except Exception as e:
            pass
    return o_o

def get_artical_info_common(homepage, uname, page_count=1):
    lanmu_url = "http://" + homepage + "/lanmu"
    lanmu = requests.get(lanmu_url + "_" + str(page_count), headers=header)
    artical_soup = BeautifulSoup(lanmu.text, "lxml")
    check = "http://" + homepage + "/api/article/ajaxcategorylist?uname=" + uname
    flag = requests.get(check, headers=header)
    if len(flag.text) > 47:
        artical_info = artical_soup.select("ul.article_ul > li > div.clearfix")
        a_Oinfo = parse_artical_info(artical_info)
        try:
            all_page_count = artical_soup.select("a.page_turn_a")[0].text.strip()
            if all_page_count and int(all_page_count) >= page_count:
                for page_number in range(2, int(all_page_count) + 1):
                    page_html = requests.get(lanmu_url + "_" + str(page_number), headers=header)
                    articalsoup = BeautifulSoup(page_html.text, "lxml")
                    artical_info = articalsoup.select("ul.article_ul > li > div.clearfix")
                    a_Oinfo.extend(parse_artical_info(artical_info))  # extend, not append, to keep a flat list of articles
        except Exception as e:
            pass
        return a_Oinfo
    else:
        return

def get_doctor_info(doctor_info, hospital_office):
    doctor_name = []
    doctor_hompage = []
    doctor_artical_url = []
    doctor_skillful = []
    doctor_experience = []
    artical_info = []
    hospital, office = get_hospital_office(hospital_office)
    for _ in doctor_info:
        url = urlparse(_['href'])
        homepage = url.netloc
        artical_url = homepage + "/lanmu"
        uname = homepage.split(".")[0]
        querystring = {"uname": uname}
        detail_url = "http://" + homepage + "/api/index/ajaxdoctorintro"
        details = requests.get(detail_url, headers=header, params=querystring)
        dsoup = BeautifulSoup(details.text, 'lxml')
        skillful = dsoup.select(".pb15.bbd_e9 > p.hh")
        experience = dsoup.select(".pt15 > p.hh")
        # artical_info = get_artical_info(artical_url)
        doctor_name.append(_.text.split())
        doctor_hompage.append(homepage.split())
        doctor_artical_url.append(artical_url.split())
        doctor_skillful.append(skillful[0].text.split())
        doctor_experience.append(experience[0].text.split())
        t = get_artical_info_common(homepage, uname)
        artical_info.append(t)
    allinfo = zip(doctor_name, hospital, office, doctor_hompage, doctor_artical_url, doctor_skillful, doctor_experience, artical_info)
    store(allinfo, 'ZheShiJiaDeHaodaifu')

def store(info, filename):
    with open(filename, 'a') as dest:
        dest.write(json.dumps(info, ensure_ascii=False).encode('utf8') + '\n')

# If something goes wrong, use this to run a simple test.
def simple_test(testurl):
    print testurl
    r = requests.get(testurl, headers=header)
    mainsoup = BeautifulSoup(r.text, "lxml")
    doctor_info = mainsoup.select(".fl > .mt5 > a")
    hospital_office = mainsoup.select(".fr > .fb")
    get_doctor_info(doctor_info, hospital_office)

def spider_worker(urllist):
    for url in urllist:
        try:
            import time, random
            time.sleep(int(random.uniform(3, 7)))
            r = requests.get(url, headers=header)
        except Exception as e:
            time.sleep(60 * 15)  # you could also use a generator that yields proxies, or a middleware
            r = requests.get(url, headers=header)
        mainsoup = BeautifulSoup(r.text, "lxml")
        doctor_info = mainsoup.select(".fl > .mt5 > a")
        hospital_office = mainsoup.select(".fr > .fb")
        try:
            get_doctor_info(doctor_info, hospital_office)
        except Exception as e:
            time.sleep(60 * 15)
            get_doctor_info(doctor_info, hospital_office)

if __name__ == '__main__':
    import sys
    reload(sys)
    sys.setdefaultencoding('utf-8')
    arguments = docopt(__doc__, version='haodaifu spider 0.1')  # docopt expects a usage pattern in the module docstring
    print arguments
    header = {
        "Accept-Encoding": "gzip, deflate, sdch, br",
        "Accept-Language": "zh-CN,zh;q=0.8",
        "Connection": "keep-alive",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
        "Cookie": "g=HDF.96.58a65f5833eab",
        "If-Modified-Since": "Fri, 17 Feb 2017 04:23:52 GMT",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36"
    }
    start_url = 'http://400.haodf.com/index/search?diseasename=&province=&facultyname=&hosfaculty=&hospitalname=&nowpage='
    # test_url = 'http://400.haodf.com/index/search?diseasename=&province=&facultyname=&hosfaculty=&hospitalname=&nowpage=9'
    urllist = [start_url + str(i) for i in range(1, 3911)]
    spider_worker(urllist)
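The comment in `spider_worker` mentions using a generator that yields proxies instead of only sleeping on failure. A minimal sketch of what that could look like, using the `proxies` argument that requests supports per request (the proxy list, pool, and fetch_with_proxy names are hypothetical):

    # Sketch: rotate through a proxy list on each retry instead of a fixed sleep.
    import itertools
    import requests

    PROXIES = [
        {"http": "http://127.0.0.1:8080"},  # placeholder proxies; fill in real ones
        {"http": "http://127.0.0.1:8081"},
    ]

    def proxy_pool(proxies):
        # Yield proxies round-robin forever, so the worker can rotate on every retry.
        return itertools.cycle(proxies)

    pool = proxy_pool(PROXIES)

    def fetch_with_proxy(url, headers):
        # Try each proxy once; if all fail, fall back to a direct request.
        for _ in range(len(PROXIES)):
            try:
                return requests.get(url, headers=headers, proxies=next(pool), timeout=10)
            except requests.RequestException:
                continue
        return requests.get(url, headers=headers, timeout=10)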
#!/usr/bin/env python
# coding=utf-8
import requests, json
from urlparse import urlparse
from bs4 import BeautifulSoup
import lxml
# At first I planned to make areaurlist a global variable and return it from make_url,
# but that looked ugly, so I extracted the URL list and stored it in a separate txt file.
import sys
reload(sys)
sys.setdefaultencoding('utf-8')

headers = {
    'host': "www.haodf.com",
    'connection': "keep-alive",
    'cache-control': "no-cache",
    'upgrade-insecure-requests': "1",
    'user-agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36",
    'accept': "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
    'accept-encoding': "gzip, deflate, sdch",
    'accept-language': "zh-CN,zh;q=0.8",
    'cookie': "g=40982_1487298388760; __ads_session=fS8tLPu23Qh88vQLKgA=; NAVTOUCH=1; g=HDF.112.58abde6b2b204"
}

# use hospital_special_id to get
def make_url(start_url):
    hospital_url_list = []
    hospital_name_list = []
    hospital_intro_url_list = []
    hospital_special_id_list = []
    test_hospital = requests.get(start_url, headers=headers)
    h_soup = BeautifulSoup(test_hospital.text, 'lxml')
    start = h_soup.select('.m_ctt_green a')
    for _ in start:
        hospital_name_list.append(_.text)
        try:
            special_hospital_id = _['href'].split("/")[2].strip(".htm")
            hospital_url = "http://www.haodf.com/hospital" + _['href']  # just store the homepage
            hospital_intro_url = "http://info.haodf.com/hospital/" + special_hospital_id + "/jieshao.htm"
            hospital_url_list.append(hospital_url)
            hospital_intro_url_list.append(hospital_intro_url)
            hospital_special_id_list.append(special_hospital_id)
        except (KeyError, IndexError):
            pass
    return hospital_name_list, hospital_url_list, hospital_intro_url_list, hospital_special_id_list

# return hospital intro and faculty info
def get_hospital_info(hospital_url):
    hinfo = requests.get(hospital_url, headers=headers)
    info_soup = BeautifulSoup(hinfo.text, 'lxml')
    try:
        res = info_soup.select('.czsj td')[0].text
    except IndexError as e:
        res = ""
    return res

# get a single faculty's info
def get_faculty_info(faculty_special_id):
    faculty_url = "http://www.haodf.com/faculty/" + faculty_special_id + "/jieshao.htm"
    hospital_faculty = requests.get(faculty_url, headers=headers)
    faculty_soup = BeautifulSoup(hospital_faculty.text, 'lxml')
    try:
        faculty_detail = faculty_soup.select('#about_det')[0].text
    except IndexError as e:
        faculty_detail = ""
    return faculty_detail

# get all faculty URLs of a hospital, e.g. http://www.haodf.com/hospital/DE4raCNSz6OmG3OUNZWCWNv0/keshi.htm
def get_hospital_faculty(hospital_special_id):
    url = "http://www.haodf.com/hospital/" + hospital_special_id + "/keshi.htm"
    hospital_faculty_all = []
    faculty_info = {}
    tmp = requests.get(url, headers=headers)
    t_soup = BeautifulSoup(tmp.text, 'lxml')
    faculty_list = t_soup.select('.bluepanel .blue')
    for _ in faculty_list:
        faculty_info['faculty_name'] = _.text
        faculty_info['url'] = _['href']
        faculty_info['special_id'] = urlparse(_['href']).path.split('/')[2].split('.')[0]
        hospital_faculty_all.append(faculty_info.copy())
    return hospital_faculty_all

def one_hospital(hospital_name_list, hospital_url_list, hospital_intro_url_list, hospital_special_id_list):
    info = {}
    for _ in zip(hospital_name_list, hospital_url_list, hospital_intro_url_list, hospital_special_id_list):
        info['hospital_name'] = _[0]
        info['hospital_homepage'] = _[1]
        try:
            info['hospital_detial'] = get_hospital_info(_[2])
            faculty_info = get_hospital_faculty(_[3])
            all_faculty_info = []
        except Exception as e:
            import time
            time.sleep(3 * 60)
            info['hospital_detial'] = get_hospital_info(_[2])
            faculty_info = get_hospital_faculty(_[3])
            all_faculty_info = []
        for _ in faculty_info:
            try:
                single_info = get_faculty_info(_['special_id'])
                _['faculty_info'] = single_info
                all_faculty_info.append(_)
            except Exception as e:
                import time
                time.sleep(3 * 60)
                single_info = get_faculty_info(_['special_id'])
                _['faculty_info'] = single_info
                all_faculty_info.append(_)
        info['all_faculty_info'] = all_faculty_info
        store(info, "./Info")

def store(info, filename):
    with open(filename, 'a') as dest:
        dest.write(json.dumps(info, ensure_ascii=False).encode('utf8') + '\n')

# done, without try/except
def test_get_hospital_faculty():
    test = get_hospital_faculty("DE4raCNSz6OmG3OUNZWCWNv0")
    for _ in test:
        print _
    return test

# done; faculty_special_id comes from get_hospital_faculty
def test_get_faculty_info():
    test = get_faculty_info("DE4r0BCkuHzduSnKssvRDydDzVOlL")
    print test

# done with hospital_info
def test_get_hospital_info():
    test = get_hospital_info("http://www.haodf.com/faculty/DE4roiYGYZw0JOrEpjdCy8jrf/jieshao.htm")
    print test

# done; can get all info for one hospital, but there is a small problem across different platforms
def test_one_hospital(url):
    a, s, d, f = make_url(url)
    one_hospital(a, s, d, f)

# area_url.txt was extracted from the homepage's area list (a small tweak to make_url is enough)
with open('./area_url.txt') as area_list:
    for area in area_list.readlines():
        print "Now processing: " + area.strip('\n')
        test_one_hospital(area.strip('\r\n'))
# Never expected that \r was the culprit
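For the area_url.txt mentioned above, a minimal sketch of how the per-area URLs could be collected and written out. The index URL, the '.m_ctt_green a' selector (borrowed from make_url), and the href-joining scheme are all assumptions, since the actual tweak to make_url is not shown:

    # Sketch: dump area list URLs from an index page to area_url.txt (assumptions noted above).
    def dump_area_urls(index_url, out_file='./area_url.txt'):
        page = requests.get(index_url, headers=headers)
        soup = BeautifulSoup(page.text, 'lxml')
        with open(out_file, 'w') as dest:
            for link in soup.select('.m_ctt_green a'):
                href = link.get('href')
                if href:
                    # assumes relative hrefs; adjust the prefix to match the real page
                    dest.write("http://www.haodf.com" + href + '\n')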