Skip to content

Instantly share code, notes, and snippets.

@wagurano
Last active December 20, 2015 14:58
Show Gist options
  • Save wagurano/6150271 to your computer and use it in GitHub Desktop.
Save wagurano/6150271 to your computer and use it in GitHub Desktop.
전국 어린이집 세부 정보를 가져옵니다. http://m.childcare.go.kr/nursery/mAllNurserySlPL.jsp?programId=null&flag=NSSlPL&offset=1411&ctprvn=28000&signgu=&dong=&road=&crtype=&crspec=&crcert=&crpub=&crname= 화면에서 2번째 항목(인천 남동구 오렌지 어린이집)을 클릭하면 오류가 발생함. 기타 경비를 출력할 때 "," 콤마 제외
#encoding: utf-8
require 'net/http'
require 'nokogiri'
require 'open-uri'
require 'timeout'
TIMEOUT_CNT = 42
#cd = '29200000641'
#cd = '11620000061'
# Prints one "INFO" line per header cell of the nursery's info table.
#
# The <th> and <td> cells matched by "table.info_table" are paired by position.
# Commas are stripped from the header text and replaced with ":" in the value
# text so that the comma-separated output keeps a fixed number of fields.
#
# nursery_code - value echoed as the second field of every line.
# document     - object responding to #css (the parsed detail page in this file).
#
# A header that has no value cell at the same position is printed with an
# empty value instead of raising NoMethodError on nil.
def get_nursery_info(nursery_code, document)
  headers = document.css("table.info_table th").to_a
  values = document.css("table.info_table td").to_a

  headers.each_with_index do |header, idx|
    value = values[idx]
    value_text = value ? value.content.gsub(/,/, ':') : ''
    puts "INFO,#{nursery_code},#{header.content.gsub(/,/, '')},#{value_text}"
  end
end
# Prints one "CHILD" line for every complete group of nine cells.
#
# children is a flat, enumerable list of cells responding to #content.
# Within each group of nine, the text of cells 0, 1, 2 and 5 is printed after
# the nursery code (the original author named these class category, class
# name, teachers and children). The text is emitted exactly as found.
def get_calss_info(nursery_code, children)
  children.each_slice(9) do |row|
    # A trailing group with fewer than nine cells never produces a line.
    next if row.length < 9

    category, name, teachers, enrolled = row.values_at(0, 1, 2, 5).map(&:content)
    puts "CHILD,#{nursery_code},#{category},#{name},#{teachers},#{enrolled}"
  end
end
# Prints one "STAFF" line for every positive count in the staff table.
#
# staffs is a flat, enumerable list of cells responding to #content, read in
# groups of eleven. The first cell of each group is the row label; it is mapped
# to a numeric code through the lookup table below (an unknown label prints as
# an empty field). Groups whose label is "계" (Korean for "total") are skipped
# entirely. Only columns 3..10 are reported, and only when the cell text
# converts to an integer greater than zero.
def get_staff(nursery_code, staffs)
  column_labels = ["-","-","-","1급","2급","3급","특수교사","치료사","영양사","취사부","기타"]
  tenure_codes = { "6개월미만" => 0, "6개월 ~ 1년" => 1, "1년 ~ 2년" => 2, "2년 ~ 3년" => 3, "3년 ~ 4년" => 4, "4년 ~ 5년" => 5, "5년 이상" => 6 }

  staffs.each_slice(11) do |row|
    tenure = row.first.content
    next if tenure == "계"

    row.each_with_index do |cell, col|
      next unless col > 2 && cell.content.to_i > 0

      puts "STAFF,#{nursery_code},#{column_labels[col]},#{tenure_codes[tenure]},#{cell.content.strip}"
    end
  end
end # get_staff
# Prints one "FEE" line for every positive amount in a five-column fee table.
#
# fees is a flat, enumerable list of cells responding to #content, read in
# groups of five: the first cell is the row label, the remaining four are
# amounts. An amount is reported when its text converts to an integer greater
# than zero; the line carries the zero-based amount column (0..3), the row
# label, and the amount text with commas removed and whitespace trimmed.
def get_fee(nursery_code, fees)
  fees.each_slice(5) do |row|
    label, *amounts = row.map(&:content)

    amounts.each_with_index do |text, slot|
      next unless text.to_i > 0

      puts "FEE,#{nursery_code},#{slot},#{label},#{text.gsub(/,/, '').strip}"
    end
  end
end
# Prints one "FEE" line for every positive amount in a four-column fee table.
#
# fees is a flat, enumerable list of cells responding to #content. Every
# fourth cell (positions 0, 4, 8, ...) is a row label; the three cells after it
# are amounts. The column number written to the line is shifted to 4..6, which
# continues after the 0..3 range used by get_fee. Amount text has commas
# removed and surrounding whitespace trimmed.
def get_fee2(nursery_code, fees)
  row_label = ""

  fees.each_with_index do |cell, position|
    column = position % 4
    text = cell.content

    if column.zero?
      row_label = text
    elsif text.to_i > 0
      puts "FEE,#{nursery_code},#{column + 3},#{row_label},#{text.gsub(/,/, '').strip}"
    end
  end
end
# Prints one "FEE" line for every complete group of four cells.
#
# fees is a flat, enumerable list of cells responding to #content. Each group
# of four is printed as: first cell verbatim, second cell with commas removed,
# third cell verbatim, fourth cell with commas and the "원" character removed
# and surrounding whitespace trimmed.
def get_fee3(nursery_code, fees)
  fees.each_slice(4) do |row|
    # An incomplete trailing group is never printed.
    next unless row.length == 4

    category, item, period, amount = row.map(&:content)
    puts "FEE,#{nursery_code},#{category},#{item.gsub(/,/, '')},#{period},#{amount.gsub(/[,원]/, '').strip}"
  end
end
#<ul class="nursry_search_list">
#http://m.childcare.go.kr/nursery/mAllNurserySlPL.jsp?programId=null&flag=NSSlPL&offset=10000&ctprvn=11000&signgu=&dong=&road=&crtype=&crspec=&crcert=&crpub=&crname=&x=36&y=14
#http://m.childcare.go.kr/nursery/neighbored_foundSlPL.jsp?OpenItem=&flag=BISl&stcode=11620000061&nextClick=0&appAuth=&menu2=_on
# Downloads the detail page of one nursery and prints its records to stdout.
#
# nursery_code - code interpolated into the "stcode" query parameter.
#
# The page is fetched and parsed under a 5-second timeout. An EOFError or a
# timeout is retried up to TIMEOUT_CNT attempts in total, sleeping one second
# longer before each new attempt; once the attempts are used up the last error
# is re-raised. An HTTP error is reported and the nursery is skipped.
#
# Only the download is inside the timeout/retry region, so a retry can never
# re-print rows that were already emitted for this nursery.
def print_nursery(nursery_code)
  url = "http://m.childcare.go.kr/nursery/neighbored_foundSlPL.jsp?OpenItem=&flag=BISl&stcode=#{nursery_code}&nextClick=0&appAuth=&menu2=_on"
  retries = TIMEOUT_CNT
  doc = nil

  begin
    # URI.open is the open-uri entry point; Kernel#open no longer accepts URLs
    # on Ruby 3.0 and later.
    doc = Timeout.timeout(5) { Nokogiri::HTML(URI.open(url)) }
  rescue OpenURI::HTTPError
    puts "ERROR,PRINT,#{nursery_code},HTTP"
    return
  rescue EOFError, Timeout::Error => e
    puts "ERROR,PRINT,#{nursery_code},EOF" if e.is_a?(EOFError)
    retries -= 1
    if retries > 0
      puts "sleep"
      sleep TIMEOUT_CNT - retries
      retry
    else
      puts "raise"
      raise
    end
  end

  if doc.nil?
    puts "PRINT,#{nursery_code},ERROR"
    return
  end

  get_nursery_info(nursery_code, doc)

  # The class table and the staff table are the first two "state_table"s;
  # without both of them the remaining sections are not printed.
  state_tables = doc.css("table.state_table")
  if state_tables.length < 2
    puts "PRINT,#{nursery_code},NO_STATE_TABLE,ERROR"
    return
  end

  get_calss_info(nursery_code, state_tables[0].css("td"))
  get_staff(nursery_code, state_tables[1].css("td"))
  get_fee(nursery_code, doc.css("table.pay_table01 td"))
  get_fee2(nursery_code, doc.css("table.pay_table02 td"))
  get_fee3(nursery_code, doc.css("table.pay_table td"))
end
#province = ['11', '26', '27', '28', '29', '30', '36', '41', '42', '43', '44', '45','46', '47', '48','49']
# Driver: walks the nursery list pages of every configured province and prints
# the detail records of each nursery found.
province = ['28']
province.each do |prvn|
  puts "FETCH,#{prvn}"
  # Listing offset to start from; the loop below stops once it passes 1410.
  offset = 1401 #5191 #prvn=41 #1 #6710
  loop do
    break if offset > 1410

    url = "http://m.childcare.go.kr/nursery/mAllNurserySlPL.jsp?programId=null&flag=NSSlPL&offset=#{offset}&ctprvn=#{prvn}000&signgu=&dong=&road=&crtype=&crspec=&crcert=&crpub=&crname="
    retries = TIMEOUT_CNT
    codes = []
    names = []
    begin
      # Reset on every attempt so a retry never sees half-filled results.
      codes = []
      names = []
      Timeout.timeout(5) do
        puts "FETCH,#{prvn},#{offset}"
        # URI.open is the open-uri entry point; Kernel#open no longer accepts
        # URLs on Ruby 3.0 and later.
        doc = Nokogiri::HTML(URI.open(url))
        list = doc.at_css("ul.nursry_search_list")
        # A page without the result list is treated as an empty page instead of
        # failing with NoMethodError on nil.
        if list
          hrefs = list.css("a").map { |link| link['href'] }.compact
          # The nursery code is the single-quoted text inside each href.
          codes = hrefs.flat_map { |href| href.scan(/'(.*)'/).map(&:first) }
          names = list.css("li[@class = 'first orange']/strong").map(&:content)
        end
      end
      puts "NEXT,#{prvn},#{offset},#{codes.length}"
    rescue OpenURI::HTTPError
      puts "FETCH,#{prvn},#{offset},ERROR"
      # Retry the same offset a bounded number of times; retrying without a
      # limit would loop forever on a persistent HTTP error.
      retries -= 1
      if retries > 0
        puts "sleep"
        sleep 0.42 * (TIMEOUT_CNT - retries)
        retry
      end
      # The page size is unknown here, so the offset cannot be advanced safely:
      # give up on this province.
      break
    rescue Timeout::Error
      retries -= 1
      if retries > 0
        puts "sleep"
        sleep 0.42 * (TIMEOUT_CNT - retries)
        retry
      else
        puts "raise"
        raise
      end
    end # TIMEOUT

    codes.zip(names).each do |code, name|
      puts "PRINT,#{code}"
      puts "INFO,#{code},이름,#{name}"
      print_nursery(code)
    end

    # An empty page marks the end of the listing; otherwise advance by the
    # number of entries just processed.
    break if codes.empty?

    offset += codes.length
  end # offset
end # prvn
puts "END"
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment