Last active
December 20, 2015 14:58
-
-
Save wagurano/6150271 to your computer and use it in GitHub Desktop.
전국 어린이집 세부 정보를 가져옵니다.
http://m.childcare.go.kr/nursery/mAllNurserySlPL.jsp?programId=null&flag=NSSlPL&offset=1411&ctprvn=28000&signgu=&dong=&road=&crtype=&crspec=&crcert=&crpub=&crname=
화면에서 2번째 항목(인천 남동구 오렌지 어린이집)을 클릭하면 오류가 발생함.
기타 경비를 출력할 때 "," 콤마 제외
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#encoding: utf-8
require 'net/http'
require 'nokogiri'
require 'open-uri'
require 'timeout'

# Retry budget for a single page fetch. The fetch code starts its retry
# counter at this value and derives its back-off sleep from
# (TIMEOUT_CNT - retries remaining).
TIMEOUT_CNT = 42

# Sample nursery codes, kept for manual testing.
#cd = '29200000641'
#cd = '11620000061'
# Prints one "INFO,<code>,<header>,<value>" line per header cell of the
# page's "table.info_table".
#
# nursery_code - value echoed as the second field of every line.
# document     - parsed page; must respond to #css and return cells that
#                respond to #content.
#
# Header cells (th) and value cells (td) are paired by position. Commas are
# removed from headers and replaced by ':' in values so they cannot be
# mistaken for field separators. A header without a matching value cell is
# printed with an empty value instead of raising NoMethodError.
def get_nursery_info(nursery_code, document)
  headers = document.css("table.info_table th").to_a
  values = document.css("table.info_table td").to_a

  headers.each_with_index do |header, index|
    cell = values[index]
    value = cell ? cell.content.gsub(/,/, ':') : ''
    puts "INFO,#{nursery_code},#{header.content.gsub(/,/, '')},#{value}"
  end
end
# Prints one "CHILD,<code>,<category>,<name>,<teachers>,<children>" line per
# complete nine-cell row of the class table.
#
# nursery_code - value echoed as the second field of every line.
# children     - enumerable of table cells (each responding to #content),
#                laid out as consecutive rows of nine cells.
#
# Within a row, cell 0 is the category, cell 1 the class name, cell 2 the
# teacher count and cell 5 the child count; the other cells are ignored.
# A trailing row with fewer than nine cells is not printed. Commas are
# removed from the printed fields so they cannot shift the output columns.
# (The method name keeps its original spelling because callers use it.)
def get_calss_info(nursery_code, children)
  children.each_slice(9) do |row|
    next if row.length < 9

    category, name, teachers, enrolled =
      row.values_at(0, 1, 2, 5).map { |cell| cell.content.delete(',') }
    puts "CHILD,#{nursery_code},#{category},#{name},#{teachers},#{enrolled}"
  end
end
# Prints one "STAFF,<code>,<role>,<year category>,<count>" line for every
# positive count in the staff table.
#
# nursery_code - value echoed as the second field of every line.
# staffs       - enumerable of table cells (each responding to #content),
#                laid out as consecutive rows of eleven cells.
#
# The first cell of each row is the row label. Rows labelled "계" are skipped
# entirely. Columns 0-2 are never printed; columns 3-10 are named by
# staff_header. The label is stripped before it is compared and looked up,
# so surrounding whitespace no longer defeats the "계" check or the
# year_to_category lookup. A label missing from year_to_category prints an
# empty category field.
def get_staff(nursery_code, staffs)
  staff_header = ["-","-","-","1급","2급","3급","특수교사","치료사","영양사","취사부","기타"]
  year_to_category = { "6개월미만" => 0, "6개월 ~ 1년" => 1, "1년 ~ 2년" => 2, "2년 ~ 3년" => 3, "3년 ~ 4년" => 4, "4년 ~ 5년" => 5, "5년 이상" => 6 }
  staff_year = ""

  staffs.each_with_index do |st, index|
    column = index % 11
    staff_year = st.content.strip if column.zero?
    next if staff_year == "계"
    next unless column > 2 && st.content.to_i > 0

    puts "STAFF,#{nursery_code},#{staff_header[column]},#{year_to_category[staff_year]},#{st.content.strip}"
  end
end # get_staff
def get_fee(nursery_code, fees) | |
cnt = 0 | |
cat = "" | |
fees.each do |f| | |
cat = f.content if cnt % 5 == 0 | |
if f.content.to_i > 0 && cnt % 5 > 0 | |
#puts "FEE,#{cat},#{cnt % 5 - 1},#{f.content.gsub(/,/,'').strip}" | |
puts "FEE,#{nursery_code},#{cnt % 5 - 1},#{cat},#{f.content.gsub(/,/,'').strip}" | |
end | |
cnt += 1 | |
end | |
end | |
# Prints one "FEE,<code>,<index>,<category>,<amount>" line for every positive
# amount in a four-column fee table.
#
# nursery_code - value echoed as the second field of every line.
# fees         - enumerable of table cells (each responding to #content),
#                laid out as consecutive rows of four cells.
#
# The first cell of each row is the category label; the remaining three
# cells are amounts, printed with index 4-6 (column - 1 + 4). Commas and
# surrounding whitespace are removed from both the label and the amount so
# they cannot shift the comma-separated output. Cells whose cleaned amount
# is not a positive integer are skipped.
def get_fee2(nursery_code, fees)
  cat = ""

  fees.each_with_index do |f, index|
    column = index % 4
    if column.zero?
      cat = f.content.delete(',').strip
      next
    end

    amount = f.content.gsub(/,/, '').strip
    next unless amount.to_i > 0

    puts "FEE,#{nursery_code},#{column - 1 + 4},#{cat},#{amount}"
  end
end
# Prints one "FEE,<code>,<category>,<sub-category>,<period>,<amount>" line
# per complete four-cell row of the fee table.
#
# nursery_code - value echoed as the second field of every line.
# fees         - enumerable of table cells (each responding to #content),
#                laid out as consecutive rows of four cells.
#
# Cells are, in order: category, sub-category, period, amount. Commas are
# removed from every field (and "원" from the amount) and surrounding
# whitespace is stripped, so cell text cannot shift the comma-separated
# output. A trailing row with fewer than four cells is not printed.
def get_fee3(nursery_code, fees)
  fees.each_slice(4) do |row|
    next if row.length < 4

    cat, cat2, period = row.first(3).map { |cell| cell.content.gsub(/,/, '').strip }
    amount = row[3].content.gsub(/[,원]/, '').strip
    puts "FEE,#{nursery_code},#{cat},#{cat2},#{period},#{amount}"
  end
end
# Markup and URLs this scraper was written against:
#<ul class="nursry_search_list">
#http://m.childcare.go.kr/nursery/mAllNurserySlPL.jsp?programId=null&flag=NSSlPL&offset=10000&ctprvn=11000&signgu=&dong=&road=&crtype=&crspec=&crcert=&crpub=&crname=&x=36&y=14
#http://m.childcare.go.kr/nursery/neighbored_foundSlPL.jsp?OpenItem=&flag=BISl&stcode=11620000061&nextClick=0&appAuth=&menu2=_on

# Downloads the detail page of one nursery and prints its records by calling
# get_nursery_info, get_calss_info, get_staff, get_fee, get_fee2 and get_fee3.
#
# nursery_code - the "stcode" query value of the detail page.
#
# Only the download and parse step runs under the 5-second timeout and the
# retry loop. Records are printed afterwards, so a retry can no longer print
# the same records twice. The page is opened with URI.open in block form:
# Kernel#open stopped opening URLs in Ruby 3.0, and the block form closes the
# response once it has been parsed.
#
# An HTTP error prints "ERROR,PRINT,<code>,HTTP" and gives up on this nursery.
# EOFError and Timeout::Error are retried up to TIMEOUT_CNT attempts with a
# growing sleep; after the last attempt the error is re-raised.
def print_nursery(nursery_code)
  cd = nursery_code
  url = "http://m.childcare.go.kr/nursery/neighbored_foundSlPL.jsp?OpenItem=&flag=BISl&stcode=#{cd}&nextClick=0&appAuth=&menu2=_on"
  retries = TIMEOUT_CNT

  begin
    doc = Timeout.timeout(5) { URI.open(url) { |io| Nokogiri::HTML(io) } }
  rescue OpenURI::HTTPError
    puts "ERROR,PRINT,#{nursery_code},HTTP"
    return
  rescue EOFError, Timeout::Error => e
    puts "ERROR,PRINT,#{nursery_code},EOF" if e.is_a?(EOFError)
    retries -= 1
    if retries > 0
      puts "sleep"
      sleep TIMEOUT_CNT - retries
      retry
    else
      puts "raise"
      raise
    end
  end

  if doc.nil?
    puts "PRINT,#{cd},ERROR"
    return
  end

  get_nursery_info(cd, doc)

  state_tables = doc.css("table.state_table")
  if state_tables.length < 2
    puts "PRINT,#{cd},NO_STATE_TABLE,ERROR"
    return
  end

  get_calss_info(cd, state_tables[0].css("td"))
  get_staff(cd, state_tables[1].css("td"))
  get_fee(cd, doc.css("table.pay_table01 td"))
  get_fee2(cd, doc.css("table.pay_table02 td"))
  get_fee3(cd, doc.css("table.pay_table td"))
end
# Crawl driver: for every province code, page through the nursery list and
# print the records of each nursery found.
#
# Full list of province codes, kept for reference:
#province = ['11', '26', '27', '28', '29', '30', '36', '41', '42', '43', '44', '45','46', '47', '48','49']
province = ['28']
province.each do |prvn|
  puts "FETCH,#{prvn}"
  # Offset window to crawl; earlier starting values are kept for reference.
  offset = 1401 #5191 #prvn=41 #1 #6710
  while offset <= 1410
    codes = []
    names = []
    url = "http://m.childcare.go.kr/nursery/mAllNurserySlPL.jsp?programId=null&flag=NSSlPL&offset=#{offset}&ctprvn=#{prvn}000&signgu=&dong=&road=&crtype=&crspec=&crcert=&crpub=&crname="
    retries = TIMEOUT_CNT
    begin
      puts "FETCH,#{prvn},#{offset}"
      # URI.open in block form: Kernel#open stopped opening URLs in Ruby 3.0,
      # and the block closes the response once it has been parsed.
      doc = Timeout.timeout(5) { URI.open(url) { |io| Nokogiri::HTML(io) } }
    rescue OpenURI::HTTPError
      puts "FETCH,#{prvn},#{offset},ERROR"
      # `next` here re-requested the same offset forever; give up on this
      # province instead and continue with the next one.
      break
    rescue Timeout::Error
      retries -= 1
      if retries > 0
        puts "sleep"
        sleep 0.42 * (TIMEOUT_CNT - retries)
        retry
      else
        puts "raise"
        raise
      end
    end # TIMEOUT

    # A page without the result list yields no codes and ends this province.
    list = doc.at_css("ul.nursry_search_list")
    if list
      # The nursery code is the quoted argument inside each link's href.
      list.css("a").each do |link|
        link['href'].to_s.scan(/'(.*)'/) { |x| codes << x[0] }
      end
      list.css("li[@class = 'first orange']/strong").each { |nm| names << nm.content }
    end
    puts "NEXT,#{prvn},#{offset},#{codes.length}"

    codes.each_with_index do |code, i|
      puts "PRINT,#{code}"
      puts "INFO,#{code},이름,#{names[i]}"
      print_nursery(code)
    end

    break if codes.empty?
    offset += codes.length
  end # offset advances by the number of codes found on each page
end # prvn
puts "END"
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment