Last active
January 4, 2016 14:39
-
-
Save mcdlee/8635698 to your computer and use it in GitHub Desktop.
全省各家庭醫師整合性照護計畫院所查詢 -> csv
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*-
"""
Created on Sun Jan 26 15:46:19 2014
@author: mcdlee
"""
import csv
import unicodedata
import urllib2

from lxml import etree
def rm_blank(s):
    """Return *s* with every space, tab, newline and carriage return removed.

    Only these four ASCII characters are stripped, wherever they occur in
    the string; any other whitespace (for example a full-width space) is
    kept.  The argument is not modified; a new string is returned.
    """
    for blank in (" ", "\t", "\n", "\r"):
        s = s.replace(blank, "")
    return s
# Scrape the paginated FamilyDrSearch result table from www.nhi.gov.tw
# (pages 1-250) and dump one row per clinic into "list.csv".
#
# This is Python 2 code: it relies on urllib2 and on csv writing to a file
# opened in binary mode.

# Marker text stripped from each hospital reference, in slot order
# (first .. fourth hospital).
_HOSPITAL_RANK_TAGS = ("(第一)", "(第二)", "(第三)", "(第四)")


def _utf8_compact(text):
    """Encode *text* as UTF-8 and remove blanks from it with rm_blank()."""
    return rm_blank(text.encode('utf-8'))


def _hospital_columns(hospital_cell):
    """Return the eight hospital columns (ref, name for slots 1-4).

    The first slot is always parsed.  Slots two to four are parsed only
    when the slot element has at least one child; otherwise both of its
    columns are left as empty strings.
    """
    columns = []
    for position, rank_tag in enumerate(_HOSPITAL_RANK_TAGS):
        slot = hospital_cell[position]
        # len() of an lxml element is its number of children; an explicit
        # length test is used because element truth-testing is discouraged.
        if position == 0 or len(slot) > 0:
            columns.append(_utf8_compact(slot.text).replace(rank_tag, ""))
            columns.append(_utf8_compact(slot[0].tail))
        else:
            columns.extend(['', ''])
    return columns


def _parse_row(row):
    """Convert one result-table row element into a list of 14 CSV fields.

    Cell layout as read by the code: row[0] city, row[1] group,
    row[2] hospitals, row[3] clinic.
    """
    clinic_cell = row[3]
    clinic_ref = _utf8_compact(clinic_cell.text)
    clinic_name = _utf8_compact(clinic_cell[1].text)
    # The address is NFKC-normalised before encoding
    # (presumably to fold full-width characters -- confirm against the page).
    clinic_addr = _utf8_compact(
        unicodedata.normalize('NFKC', clinic_cell[3].tail))
    city = _utf8_compact(row[0].text)
    group_ref = _utf8_compact(row[1][0].text)
    group_name = _utf8_compact(row[1][0][0].tail)
    return [clinic_ref, clinic_name, clinic_addr, city,
            group_ref, group_name] + _hospital_columns(row[2])


# First entry is the CSV header row.
db = [["clinic ref", "clinic name", "clinic addr", "city", "groupref", "group name", "hos 1 ref", "hos 1 name", "hos 2 ref", "hos 2 name", "hos 3 ref", "hos 3 name", "hos 4 ref", "hos 4 name"]]
url_primer = 'http://www.nhi.gov.tw/OnlineQuery/FamilyDrSearch.aspx?menu=20&menu_id=926&webdata_id=3661&WD_ID=929&QueryType=2&City=&Area=&HName=&HID=&CName=&CID=&H1Name=&H1ID=&page='

for t in range(1, 251):
    url = url_primer + str(t)
    response = urllib2.urlopen(url)
    try:
        html = response.read()
    finally:
        # Release the connection even if the read fails.
        response.close()
    tree = etree.HTML(html)
    # Purely positional path to the element holding the result rows.
    # NOTE(review): tied to the page markup at the time of writing; any
    # layout change on the site will break this lookup.
    clin_list = tree[1][0][2][1][3][5][0][0][1][1][0][0][0]
    # The first child is skipped (presumably the table's header row).
    for clinic_row in clin_list[1:]:
        db.append(_parse_row(clinic_row))

# Binary mode is what the Python 2 csv module expects for its output file.
with open("list.csv", "wb") as f:
    writer = csv.writer(f)
    writer.writerows(db)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment