Skip to content

Instantly share code, notes, and snippets.

Last active January 4, 2016 14:39
Show Gist options
  • Save mcdlee/8635698 to your computer and use it in GitHub Desktop.
Save mcdlee/8635698 to your computer and use it in GitHub Desktop.
全省各家庭醫師整合性照護計畫院所查詢 -> csv
# -*- coding: utf-8 -*-
"""
Created on Sun Jan 26 15:46:19 2014

@author: mcdlee
"""
from lxml import etree
import urllib2
import unicodedata
def rm_blank(s):
    """Return *s* with every space, tab, newline and carriage return removed.

    The characters are removed wherever they occur, not only at the ends.
    Any other whitespace character is left untouched.
    """
    for blank in (" ", "\t", "\n", "\r"):
        s = s.replace(blank, "")
    return s
# Output table. The first row is the CSV header; one row per clinic is
# appended below, in the same column order.
db = [[
    "clinic ref", "clinic name", "clinic addr", "city",
    "groupref", "group name",
    "hos 1 ref", "hos 1 name",
    "hos 2 ref", "hos 2 name",
    "hos 3 ref", "hos 3 name",
    "hos 4 ref", "hos 4 name",
]]

# Prefix of the listing URL; the page number is appended to it.
# NOTE(review): this is empty in the file as published -- fill in the real
# query URL before running.
url_primer = ''

# Pages 1..250 of the listing are fetched one after another.
for t in range(1, 251):
    # str() instead of the deprecated backtick repr; identical for ints.
    url = url_primer + str(t)
    html = urllib2.urlopen(url).read()
    tree = etree.HTML(html)
    # Purely positional path to the element holding the result rows; it is
    # tied to the exact page layout and breaks if the markup changes.
    clin_list = tree[1][0][2][1][3][5][0][0][1][1][0][0][0]
    # Child 0 is skipped (presumably the table's header row -- confirm).
    for row in clin_list[1:]:
        # Every value is encoded to UTF-8 bytes before cleaning, so the
        # byte-string literals used in the .replace() calls below match
        # (Python 2 semantics with the file's utf-8 coding declaration).
        clinic_ref = rm_blank(row[3].text.encode('utf-8'))
        clinic_name = rm_blank(row[3][1].text.encode('utf-8'))
        # NFKC normalisation is applied to the address before encoding.
        clinic_addr = rm_blank(
            unicodedata.normalize('NFKC', row[3][3].tail).encode('utf-8'))
        city = rm_blank(row[0].text.encode('utf-8'))
        group_ref = rm_blank(row[1][0].text.encode('utf-8'))
        group_name = rm_blank(row[1][0][0].tail.encode('utf-8'))

        # Hospital slots live under row[2]. The first slot is read
        # unconditionally; the ordinal marker is stripped from its ref text.
        hospitals = row[2]
        hos_1_ref = rm_blank(hospitals[0].text.encode('utf-8')).replace("(第一)", "")
        hos_1_name = rm_blank(hospitals[0][0].tail.encode('utf-8'))

        # Slots 2-4 are only parsed when the slot element has children;
        # otherwise the columns are left blank. len() is used on purpose:
        # truth-testing an lxml element directly is discouraged.
        if len(hospitals[1]) > 0:
            hos_2_ref = rm_blank(hospitals[1].text.encode('utf-8')).replace("(第二)", "")
            hos_2_name = rm_blank(hospitals[1][0].tail.encode('utf-8'))
        else:
            hos_2_ref = ''
            hos_2_name = ''

        if len(hospitals[2]) > 0:
            hos_3_ref = rm_blank(hospitals[2].text.encode('utf-8')).replace("(第三)", "")
            hos_3_name = rm_blank(hospitals[2][0].tail.encode('utf-8'))
        else:
            hos_3_ref = ''
            hos_3_name = ''

        if len(hospitals[3]) > 0:
            hos_4_ref = rm_blank(hospitals[3].text.encode('utf-8')).replace("(第四)", "")
            hos_4_name = rm_blank(hospitals[3][0].tail.encode('utf-8'))
        else:
            hos_4_ref = ''
            hos_4_name = ''

        db.append([
            clinic_ref, clinic_name, clinic_addr, city,
            group_ref, group_name,
            hos_1_ref, hos_1_name,
            hos_2_ref, hos_2_name,
            hos_3_ref, hos_3_name,
            hos_4_ref, hos_4_name,
        ])
import csv
# Dump the collected table (header row + one row per clinic) to list.csv.
# "wb" is the mode the Python 2 csv module expects; under Python 3 this
# would need to be text mode with newline="" instead.
with open("list.csv", "wb") as f:
    writer = csv.writer(f)
    # Without this call the file is created but left empty.
    writer.writerows(db)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment