Skip to content

Instantly share code, notes, and snippets.

@1ambda
Last active August 29, 2015 13:58
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save 1ambda/10235360 to your computer and use it in GitHub Desktop.
Save 1ambda/10235360 to your computer and use it in GitHub Desktop.
# -*- coding: cp949 -*-
from bs4 import BeautifulSoup
import urllib2
import re
# Scrape the certification list published on www.hrd.go.kr, open the detail
# popup of every certification, and dump the rows of its exam-info table to
# cert.csv (one CSV line per table row).
#
# This is a Python 2 script (urllib2); the syntax below is kept parseable by
# Python 3 as well, but the network/IO layer is Python 2 only.

# Listing page holding one "#tmp" anchor per certification.
LIST_URL = "http://www.hrd.go.kr/jsp/HRDP/HRDP300/HRDP310/HRDP311/HRDP311_2List.jsp?keco_cd=&keco_nm=&orderField=jm_nm&orderDir=asc&pageNo=1&jm_cd=&kecoM=&kecoS=&srchKey=&rowsPerPage=1000"
# Detail popup; the 4-digit certification number is appended as jm_cd.
CERT_BASIC_URL = "http://www.hrd.go.kr/jsp/HRDP/popup/popLicenseView.jsp?jm_cd="
OUTPUT_PATH = "cert.csv"

# The site's pages are decoded as cp949 (undecodable bytes are ignored).
PAGE_ENCODING = 'cp949'
# The original wrote with 'mbcs', which only exists on Windows. cp949 matches
# the page encoding and the coding declaration of this file, and works
# everywhere. Unencodable characters are replaced rather than raising.
OUTPUT_ENCODING = 'cp949'

ONCLICK_PREFIX = 'viewLicDetail'
CERT_NUMBER_RE = re.compile(r'\d\d\d\d')

# Columns 1..4 of every exam-table row are exported (column 0 is skipped), so
# a usable table needs at least this many columns.
FIRST_EXPORTED_COLUMN = 1
LAST_EXPORTED_COLUMN = 4


def fetch_soup(url):
    """Download *url* and return its body parsed by BeautifulSoup.

    The response is always closed, even when reading or decoding fails.
    Network errors raised by urllib2 propagate to the caller.
    """
    response = urllib2.urlopen(url)
    try:
        html = response.read().decode(PAGE_ENCODING, 'ignore')
    finally:
        response.close()
    return BeautifulSoup(html)


def extract_cert_number(onclick):
    """Return the certification number embedded in an ``onclick`` value.

    Only handlers starting with ``viewLicDetail`` are considered; the number
    is the first run of four digits in the handler text. Returns ``None``
    when the attribute is missing, is some other handler, or has no number.
    """
    if not onclick or not onclick.startswith(ONCLICK_PREFIX):
        return None
    found = CERT_NUMBER_RE.search(onclick)
    if found is None:
        return None
    return found.group()


def format_csv_row(fields):
    """Return *fields* as one CSV line: every field quoted, newline-terminated.

    Embedded double quotes are doubled and no space is emitted after the
    separating commas, so that standard CSV readers parse the line back into
    the same fields.
    """
    quoted = [u'"' + field.replace(u'"', u'""') + u'"' for field in fields]
    return u",".join(quoted) + u"\n"


def cert_rows(soup):
    """Return the CSV field lists for one certification detail page.

    Each item is ``[title, description, col1, col2, col3, col4]`` where the
    columns come from one row of the exam-info table (``tabReview02``).
    An empty list is returned when the page has no usable title, no exam
    tab, or an exam table with too few columns.
    """
    # NOTE(review): the second argument is a set, not a dict. bs4 appears to
    # treat a non-dict ``attrs`` as a CSS-class filter, i.e. "h2 having any of
    # these classes" -- kept exactly as in the original; confirm the intent.
    headings = soup.find_all("h2", {"class", "tit", "bgNone", "pLNo"})
    if not headings:
        return []
    title = headings[0].get_text()
    # A single-space title marks an invalid (empty) certification page.
    if title == " ":
        return []

    # Description: first <p> of the description tab, or the text "None".
    description = u"None"
    desc_tab = soup.find(id="tabReview01")
    if desc_tab is not None:
        paragraph = desc_tab.find("p")
        if paragraph is not None:
            description = paragraph.text.strip()

    exam_tab = soup.find(id="tabReview02")
    if exam_tab is None:
        return []

    # The number of <th> cells is used as the row width of the flat <td> list.
    column_count = len(exam_tab.find_all("th"))
    if column_count <= LAST_EXPORTED_COLUMN:
        # No header (would divide by zero) or too narrow for columns 1..4.
        return []
    cells = exam_tab.find_all("td")
    # The original deliberately exports one row fewer than the table holds
    # (presumably a trailing non-data row -- TODO confirm against the page).
    row_count = len(cells) // column_count - 1

    clean_title = title.strip()
    rows = []
    for row in range(row_count):
        start = row * column_count
        fields = [clean_title, description]
        for column in range(FIRST_EXPORTED_COLUMN, LAST_EXPORTED_COLUMN + 1):
            fields.append(cells[start + column].text.strip())
        rows.append(fields)
    return rows


def main():
    """Fetch every certification page and write its exam rows to cert.csv."""
    listing = fetch_soup(LIST_URL)
    all_link = listing.find_all("a", attrs={"href": "#tmp"})

    # ``with`` guarantees the file is flushed and closed even if a later
    # request or parse step raises.
    with open(OUTPUT_PATH, "w") as out:
        for link in all_link:
            number = extract_cert_number(link.get('onclick'))
            if number is None:
                continue
            detail = fetch_soup(CERT_BASIC_URL + number)
            for fields in cert_rows(detail):
                line = format_csv_row(fields).encode(OUTPUT_ENCODING, 'replace')
                # Echo progress to the console; single-argument print(...)
                # behaves the same under the Python 2 print statement.
                print(line)
                out.write(line)


if __name__ == "__main__":
    main()
@1ambda
Copy link
Author

1ambda commented Apr 9, 2014

Dependencies: bs4 (Beautiful Soup 4), urllib2, re

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment