@mylamour
Last active March 6, 2017 08:46
Although I scraped this site's data, I don't believe the data on this site is genuine.
#!/usr/bin/env python
# coding:utf-8
"""
This code makes me look like an idiot: the O(n^2) nested scan is not a good idea.
We can store the first file's records in a dict and just make a single pass over the
second file, which clearly cuts the work down by roughly (m - n); binary search would
give O(n log n), but in Python a dict lookup is O(1).
The info structure looks like: [{}, {}, {}]
I will optimize this code later; for now it just shows how to use a pandas DataFrame
to convert JSON to xlsx.
"""
import json
from pandas import ExcelWriter, DataFrame

xlsx_writer = ExcelWriter('./doctor_henan.xlsx')

def store(info, filename):
    with open(filename, 'a') as dest:
        dest.write(info + '\n')

info = []
with open("./hospital_info_henan") as hjilin:  # open("./doctor_info_jilin") as djilin:
    for hosptial in hjilin.readlines():
        hosptials = json.loads(hosptial)
        for _ in hosptials['all_faculty_info']:
            with open("./doctor_info_henan") as djilin:
                for doctor in djilin.readlines():
                    try:
                        doctors = json.loads(doctor)
                        if doctors['faculty_id'] == _['special_id'] and doctors['docotor_info'] is not None:
                            for person in doctors['docotor_info']:
                                opt = {
                                    u'医生姓名': person['name'],
                                    u"职称": person['grade'],
                                    u"等级": person['educateGrade'],
                                    u"所属科室": _['faculty_name'],
                                    u"所属医院": hosptials['hospital_name'],
                                }
                                info.append(opt)
                    except Exception as e:
                        store(doctor, "./error_data")

test_df = DataFrame(info)
test_df.to_excel(xlsx_writer, 'Sheet1')
xlsx_writer.save()
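As the docstring above notes, the nested file scan can be replaced by indexing the doctor records in a dict keyed by faculty_id and making a single pass over the hospital file. A minimal sketch of that idea, reusing the same file names and JSON keys as the script (the dict name doctors_by_faculty is mine, and this is a sketch of the flow rather than a drop-in replacement):

    # Sketch: build a faculty_id -> doctor list index once, then look it up in O(1)
    # while walking the hospital file. File names and keys follow the script above.
    import json

    doctors_by_faculty = {}
    with open("./doctor_info_henan") as df:
        for line in df:
            record = json.loads(line)
            if record.get('docotor_info'):
                doctors_by_faculty.setdefault(record['faculty_id'], []).extend(record['docotor_info'])

    info = []
    with open("./hospital_info_henan") as hf:
        for line in hf:
            hospital = json.loads(line)
            for faculty in hospital['all_faculty_info']:
                # O(1) dict lookup instead of re-reading the doctor file every time
                for person in doctors_by_faculty.get(faculty['special_id'], []):
                    info.append({
                        u'医生姓名': person['name'],
                        u"职称": person['grade'],
                        u"等级": person['educateGrade'],
                        u"所属科室": faculty['faculty_name'],
                        u"所属医院": hospital['hospital_name'],
                    })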
#!/usr/bin/env python
# coding:utf-8
import json
from urlparse import urlparse

doctor_touch_urlList = []
hospital_urllist = []

def get_doctor_count(doctors_info_file):
    with open(doctors_info_file) as d:
        origin_count = 0
        for _ in d.readlines():
            test = json.loads(_)
            # check for None before counting, otherwise len(None) raises TypeError
            if test['docotor_info'] is not None:
                origin_count = origin_count + len(test['docotor_info'])
                for s_url in test['docotor_info']:
                    doctor_touch_urlList.append(s_url['touchUrl'])
        docotors = list(set(doctor_touch_urlList))
        return origin_count, len(docotors)

def get_hospital_count(hosptials_info_file):
    with open(hosptials_info_file) as h:
        origin_count = 0
        for _ in h.readlines():
            origin_count = origin_count + 1
            test = json.loads(_)
            hospital_urllist.append(test['hospital_homepage'])
            # You may need to change the key "hospital_homepage" to "hospital_url";
            # there were also some spelling mistakes in the data.
        hospitals = list(set(hospital_urllist))
        return origin_count, len(hospitals)

o_h, n_h = get_hospital_count('hosptial_info_jilin')
o_d, n_d = get_doctor_count('doctor_info_jilin')
print "origin_hospital: " + str(o_h) + " unique_hospital: " + str(n_h) + '\n' + "origin_doctor: " + str(o_d) + " unique_doctor: " + str(n_d)
#!/usr/bin/env python
# coding=utf-8
import requests, json
import time, random
import sys
reload(sys)
sys.setdefaultencoding('utf-8')

headers = {
    'host': "m.haodf.com",
    'accept': "*/*",
    'x-requested-with': "XMLHttpRequest",
    'user-agent': "Mozilla/5.0 (iPhone; CPU iPhone OS 9_1 like Mac OS X) AppleWebKit/601.1.46 (KHTML, like Gecko) Version/9.0 Mobile/13B143 Safari/601.1",
    'referer': "http://m.haodf.com/touch/faculty/DE4roiYGYZw0GIaCFVHDuJVht.htm",
    'accept-encoding': "gzip, deflate, sdch",
    'accept-language': "zh-CN,zh;q=0.8",
    'cookie': "NAVTOUCH=1; g=HDF.112.58abebaaf0985; Hm_lvt_dfa5478034171cc641b1639b2a5b717d=1487681206; _ga=GA1.2.2089374926.1487681207; newaskindex=1",
    'connection': "close",
    'cache-control': "no-cache"
}

def store(info, filename):
    with open(filename, 'a') as dest:
        dest.write(json.dumps(info, ensure_ascii=False).encode('utf8') + '\n')

# My thinking was too narrow here: I should simply check whether `contents` is empty,
# instead of checking whether the flag field is None.
def get_doctor_info(faculty_id, p=0):
    info = {}
    contents = []
    info[u'docotor_info'] = contents
    info[u'faculty_id'] = faculty_id
    url = "http://m.haodf.com/touch/faculty/loaddoctors/" + faculty_id
    querystring = {"caseorphone": "0", "booking": "0", "p": p}
    response = requests.request("GET", url, headers=headers, params=querystring)
    docotor_info = json.loads(response.text)
    if docotor_info[u'contents'] is not None:
        for _ in docotor_info[u'contents']:
            info[u'docotor_info'].append(_)
        print info
        return info
    else:
        return None

def get_factor_doctor_info(faculty_id, p=0):
    test = get_doctor_info(faculty_id, p)
    if test is None:
        return
    else:
        map(lambda x: test['docotor_info'].append(x), get_doctor_info(faculty_id, p + 1)['docotor_info'])
        return store(test, 'doctor_test')
        # return store(test, faculty_id)

with open('./faculty_id') as f:
    for _ in f.readlines():
        f_id = _.strip('\r\n')
        try:
            get_factor_doctor_info(f_id)
            time.sleep(random.random())  # 0-1 sec
        except Exception as e:
            time.sleep(3 * 60)
            get_factor_doctor_info(f_id)
# get_doctor_info("DE4r0eJWGqZNZYiaYhI2QCo55twNygxV")
var webPage = require('webpage');
var fs = require('fs');

var puredocid = fs.open('./testpuredocid', 'r');
var homepage = fs.open('./doc_homepage_url', '+');
var preffix = 'http://www.haodf.com/doctor/';
var idlist = [];
var urllist = [];

while (!puredocid.atEnd()) {
    var id = puredocid.readLine();
    var url = preffix + id + '.htm';
    urllist.push(url);
    idlist.push(id);
}
puredocid.close();

function process() {
    if (urllist.length == 0) {
        // close (and thereby flush) the output stream before exiting,
        // otherwise buffered lines may never reach the file
        homepage.close();
        phantom.exit();
    } else {
        url = urllist.pop();
        page = require('webpage').create();
        page.open(url, onFinishedLoading);
    }
}

function onFinishedLoading(status) {
    var currentUrl = page.evaluate(function() {
        return [].map.call(document.querySelectorAll('#bp_doctor_about > div > div.middletr > div > div.doctor-home-page.clearfix > span:nth-child(3) > a'), function(link) {
            return link.getAttribute('href');
        });
    });
    if (currentUrl != "") {
        console.log(currentUrl);
        homepage.writeLine(currentUrl); // these lines were not reaching the file; closing the stream before exit (above) should address that
    }
    page.release();
    process();
}

process();
// This script takes a lot of time to run, which doesn't feel great.
#!/usr/bin/env python
# coding: utf-8
import requests, json
from urlparse import urlparse
from bs4 import BeautifulSoup
from docopt import docopt
import lxml

def get_hospital_office(hospital_office):
    hospital = []
    office = []
    for _ in hospital_office:
        h = _.text.split(" ")[0]
        o = _.text.split(" ")[1]
        hospital.append(h)
        office.append(o)
    return hospital, office

def parse_artical_info(artical_info):
    a_a = {}
    o_o = []
    for _ in artical_info:
        try:
            fl = _.text.split().index("人已读")
            a_a['artical_catalog'] = _.text.split()[0].strip("[]")
            a_a['artical_title'] = _.text.split()[1]
            a_a['read'] = _.text.split()[fl - 1]
            artical_time = _.text.split()[fl + 1]
            a_a['artical_time'] = artical_time.strip("发表于")
            a_a['origin_info'] = _.text.split()  # backup of the raw article data, process it as you like
            o_o.append(a_a.copy())
        except Exception as e:
            pass
    return o_o

def get_artical_info_common(homepage, uname, page_count=1):
    lanmu_url = "http://" + homepage + "/lanmu"
    lanmu = requests.get(lanmu_url + "_" + str(page_count), headers=header)
    artical_soup = BeautifulSoup(lanmu.text, "lxml")
    check = "http://" + homepage + "/api/article/ajaxcategorylist?uname=" + uname
    flag = requests.get(check, headers=header)
    if len(flag.text) > 47:
        artical_info = artical_soup.select("ul.article_ul > li > div.clearfix")
        a_Oinfo = parse_artical_info(artical_info)
        try:
            all_page_count = artical_soup.select("a.page_turn_a")[0].text.strip()
            if all_page_count and int(all_page_count) >= page_count:
                for page_number in range(2, int(all_page_count) + 1):
                    page_html = requests.get(lanmu_url + "_" + str(page_number), headers=header)
                    articalsoup = BeautifulSoup(page_html.text, "lxml")
                    artical_info = articalsoup.select("ul.article_ul > li > div.clearfix")
                    a_Oinfo.extend(parse_artical_info(artical_info))  # extend, not append, to keep a flat list of articles
        except Exception as e:
            pass
        return a_Oinfo
    else:
        return

def get_doctor_info(doctor_info, hospital_office):
    doctor_name = []
    doctor_hompage = []
    doctor_artical_url = []
    doctor_skillful = []
    doctor_experience = []
    artical_info = []
    hospital, office = get_hospital_office(hospital_office)
    for _ in doctor_info:
        url = urlparse(_['href'])
        homepage = url.netloc
        artical_url = homepage + "/lanmu"
        uname = homepage.split(".")[0]
        querystring = {"uname": uname}
        detail_url = "http://" + homepage + "/api/index/ajaxdoctorintro"
        details = requests.get(detail_url, headers=header, params=querystring)
        dsoup = BeautifulSoup(details.text, 'lxml')
        skillful = dsoup.select(".pb15.bbd_e9 > p.hh")
        experience = dsoup.select(".pt15 > p.hh")
        # artical_info = get_artical_info(artical_url)
        doctor_name.append(_.text.split())
        doctor_hompage.append(homepage.split())
        doctor_artical_url.append(artical_url.split())
        doctor_skillful.append(skillful[0].text.split())
        doctor_experience.append(experience[0].text.split())
        t = get_artical_info_common(homepage, uname)
        artical_info.append(t)
    allinfo = zip(doctor_name, hospital, office, doctor_hompage, doctor_artical_url, doctor_skillful, doctor_experience, artical_info)
    store(allinfo, 'ZheShiJiaDeHaodaifu')

def store(info, filename):
    with open(filename, 'a') as dest:
        dest.write(json.dumps(info, ensure_ascii=False).encode('utf8') + '\n')

# If something goes wrong, use this to run a simple test.
def simple_test(testurl):
    print testurl
    r = requests.get(testurl, headers=header)
    mainsoup = BeautifulSoup(r.text, "lxml")
    doctor_info = mainsoup.select(".fl > .mt5 > a")
    hospital_office = mainsoup.select(".fr > .fb")
    get_doctor_info(doctor_info, hospital_office)

def spider_worker(urllist):
    for url in urllist:
        try:
            import time, random
            time.sleep(int(random.uniform(3, 7)))
            r = requests.get(url, headers=header)
        except Exception as e:
            time.sleep(60 * 15)  # you could also use a generator that yields proxies, or a middleware
            r = requests.get(url, headers=header)
        mainsoup = BeautifulSoup(r.text, "lxml")
        doctor_info = mainsoup.select(".fl > .mt5 > a")
        hospital_office = mainsoup.select(".fr > .fb")
        try:
            get_doctor_info(doctor_info, hospital_office)
        except Exception as e:
            time.sleep(60 * 15)
            get_doctor_info(doctor_info, hospital_office)

if __name__ == '__main__':
    import sys
    reload(sys)
    sys.setdefaultencoding('utf-8')
    arguments = docopt(__doc__, version='haodaifu spider 0.1')  # docopt expects a usage pattern in the module docstring
    print arguments
    header = {
        "Accept-Encoding": "gzip, deflate, sdch, br",
        "Accept-Language": "zh-CN,zh;q=0.8",
        "Connection": "keep-alive",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
        "Cookie": "g=HDF.96.58a65f5833eab",
        "If-Modified-Since": "Fri, 17 Feb 2017 04:23:52 GMT",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36"
    }
    start_url = 'http://400.haodf.com/index/search?diseasename=&province=&facultyname=&hosfaculty=&hospitalname=&nowpage='
    # test_url = 'http://400.haodf.com/index/search?diseasename=&province=&facultyname=&hosfaculty=&hospitalname=&nowpage=9'
    urllist = [start_url + str(i) for i in range(1, 3911)]
    spider_worker(urllist)
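The comment in `spider_worker` mentions using a generator that yields proxies instead of only sleeping on failure. A minimal sketch of what that could look like, using the `proxies` argument that requests supports per request (the proxy list, pool, and fetch_with_proxy names are hypothetical):

    # Sketch: rotate through a proxy list on each retry instead of a fixed sleep.
    import itertools
    import requests

    PROXIES = [
        {"http": "http://127.0.0.1:8080"},  # placeholder proxies; fill in real ones
        {"http": "http://127.0.0.1:8081"},
    ]

    def proxy_pool(proxies):
        # Yield proxies round-robin forever, so the worker can rotate on every retry.
        return itertools.cycle(proxies)

    pool = proxy_pool(PROXIES)

    def fetch_with_proxy(url, headers):
        # Try each proxy once; if all fail, fall back to a direct request.
        for _ in range(len(PROXIES)):
            try:
                return requests.get(url, headers=headers, proxies=next(pool), timeout=10)
            except requests.RequestException:
                continue
        return requests.get(url, headers=headers, timeout=10)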
#!/usr/bin/env python
# coding=utf-8
import requests, json
from urlparse import urlparse
from bs4 import BeautifulSoup
import lxml
# At first I planned to make areaurlist a global variable and return it from make_url,
# but that looked ugly, so I extracted the URL list and stored it in a separate txt file.
import sys
reload(sys)
sys.setdefaultencoding('utf-8')

headers = {
    'host': "www.haodf.com",
    'connection': "keep-alive",
    'cache-control': "no-cache",
    'upgrade-insecure-requests': "1",
    'user-agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36",
    'accept': "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
    'accept-encoding': "gzip, deflate, sdch",
    'accept-language': "zh-CN,zh;q=0.8",
    'cookie': "g=40982_1487298388760; __ads_session=fS8tLPu23Qh88vQLKgA=; NAVTOUCH=1; g=HDF.112.58abde6b2b204"
}

# use hospital_special_id to get
def make_url(start_url):
    hospital_url_list = []
    hospital_name_list = []
    hospital_intro_url_list = []
    hospital_special_id_list = []
    test_hospital = requests.get(start_url, headers=headers)
    h_soup = BeautifulSoup(test_hospital.text, 'lxml')
    start = h_soup.select('.m_ctt_green a')
    for _ in start:
        hospital_name_list.append(_.text)
        try:
            special_hospital_id = _['href'].split("/")[2].strip(".htm")
            hospital_url = "http://www.haodf.com/hospital" + _['href']  # just store the homepage
            hospital_intro_url = "http://info.haodf.com/hospital/" + special_hospital_id + "/jieshao.htm"
            hospital_url_list.append(hospital_url)
            hospital_intro_url_list.append(hospital_intro_url)
            hospital_special_id_list.append(special_hospital_id)
        except (KeyError, IndexError):
            pass
    return hospital_name_list, hospital_url_list, hospital_intro_url_list, hospital_special_id_list

# return hospital intro and faculty info
def get_hospital_info(hospital_url):
    hinfo = requests.get(hospital_url, headers=headers)
    info_soup = BeautifulSoup(hinfo.text, 'lxml')
    try:
        res = info_soup.select('.czsj td')[0].text
    except IndexError as e:
        res = ""
    return res

# get a single faculty's info
def get_faculty_info(faculty_special_id):
    faculty_url = "http://www.haodf.com/faculty/" + faculty_special_id + "/jieshao.htm"
    hospital_faculty = requests.get(faculty_url, headers=headers)
    faculty_soup = BeautifulSoup(hospital_faculty.text, 'lxml')
    try:
        faculty_detail = faculty_soup.select('#about_det')[0].text
    except IndexError as e:
        faculty_detail = ""
    return faculty_detail

# get all faculty URLs of a hospital, e.g. http://www.haodf.com/hospital/DE4raCNSz6OmG3OUNZWCWNv0/keshi.htm
def get_hospital_faculty(hospital_special_id):
    url = "http://www.haodf.com/hospital/" + hospital_special_id + "/keshi.htm"
    hospital_faculty_all = []
    faculty_info = {}
    tmp = requests.get(url, headers=headers)
    t_soup = BeautifulSoup(tmp.text, 'lxml')
    faculty_list = t_soup.select('.bluepanel .blue')
    for _ in faculty_list:
        faculty_info['faculty_name'] = _.text
        faculty_info['url'] = _['href']
        faculty_info['special_id'] = urlparse(_['href']).path.split('/')[2].split('.')[0]
        hospital_faculty_all.append(faculty_info.copy())
    return hospital_faculty_all

def one_hospital(hospital_name_list, hospital_url_list, hospital_intro_url_list, hospital_special_id_list):
    info = {}
    for _ in zip(hospital_name_list, hospital_url_list, hospital_intro_url_list, hospital_special_id_list):
        info['hospital_name'] = _[0]
        info['hospital_homepage'] = _[1]
        try:
            info['hospital_detial'] = get_hospital_info(_[2])
            faculty_info = get_hospital_faculty(_[3])
            all_faculty_info = []
        except Exception as e:
            import time
            time.sleep(3 * 60)
            info['hospital_detial'] = get_hospital_info(_[2])
            faculty_info = get_hospital_faculty(_[3])
            all_faculty_info = []
        for _ in faculty_info:
            try:
                single_info = get_faculty_info(_['special_id'])
                _['faculty_info'] = single_info
                all_faculty_info.append(_)
            except Exception as e:
                import time
                time.sleep(3 * 60)
                single_info = get_faculty_info(_['special_id'])
                _['faculty_info'] = single_info
                all_faculty_info.append(_)
        info['all_faculty_info'] = all_faculty_info
        store(info, "./Info")

def store(info, filename):
    with open(filename, 'a') as dest:
        dest.write(json.dumps(info, ensure_ascii=False).encode('utf8') + '\n')

# done, without try/except
def test_get_hospital_faculty():
    test = get_hospital_faculty("DE4raCNSz6OmG3OUNZWCWNv0")
    for _ in test:
        print _
    return test

# done; faculty_special_id comes from get_hospital_faculty
def test_get_faculty_info():
    test = get_faculty_info("DE4r0BCkuHzduSnKssvRDydDzVOlL")
    print test

# done with hospital_info
def test_get_hospital_info():
    test = get_hospital_info("http://www.haodf.com/faculty/DE4roiYGYZw0JOrEpjdCy8jrf/jieshao.htm")
    print test

# done; can get all info for one hospital, but there is a small problem across different platforms
def test_one_hospital(url):
    a, s, d, f = make_url(url)
    one_hospital(a, s, d, f)

# area_url.txt was extracted from the homepage's area list (a small tweak to make_url is enough)
with open('./area_url.txt') as area_list:
    for area in area_list.readlines():
        print "Now processing: " + area.strip('\n')
        test_one_hospital(area.strip('\r\n'))
# Never expected that \r was the culprit
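For the area_url.txt mentioned above, a minimal sketch of how the per-area URLs could be collected and written out. The index URL, the '.m_ctt_green a' selector (borrowed from make_url), and the href-joining scheme are all assumptions, since the actual tweak to make_url is not shown:

    # Sketch: dump area list URLs from an index page to area_url.txt (assumptions noted above).
    def dump_area_urls(index_url, out_file='./area_url.txt'):
        page = requests.get(index_url, headers=headers)
        soup = BeautifulSoup(page.text, 'lxml')
        with open(out_file, 'w') as dest:
            for link in soup.select('.m_ctt_green a'):
                href = link.get('href')
                if href:
                    # assumes relative hrefs; adjust the prefix to match the real page
                    dest.write("http://www.haodf.com" + href + '\n')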