1ambda/collecter.py

## collecter.py
# -*- coding: cp949 -*-
from bs4 import BeautifulSoup
import urllib2
import re

# print soup.original_encoding

# Fetch the certification listing page and collect every anchor whose href
# is "#tmp"; the ones that open a detail popup are picked out below by their
# onclick handler.
url = "http://www.hrd.go.kr/jsp/HRDP/HRDP300/HRDP310/HRDP311/HRDP311_2List.jsp?keco_cd=&keco_nm=&orderField=jm_nm&orderDir=asc&pageNo=1&jm_cd=&kecoM=&kecoS=&srchKey=&rowsPerPage=1000"
contents = urllib2.urlopen(url).read().decode('cp949', 'ignore')
soup = BeautifulSoup(contents)
all_link = soup.find_all("a", attrs={"href" : "#tmp"})

# Base URL of the per-certification detail popup; the 4-digit code is appended.
cert_basic_url = "http://www.hrd.go.kr/jsp/HRDP/popup/popLicenseView.jsp?jm_cd="

# `with` closes cert.csv even when a request or a parsing step raises.
with open("cert.csv", "w") as f:
    for link in all_link:
        # Work on the attribute value directly (no str()) so a non-ASCII
        # handler cannot raise UnicodeEncodeError; a missing attribute
        # becomes "" and is skipped by the prefix test.
        onclick = link.get('onclick') or ''
        if not onclick.startswith('viewLicDetail'):
            continue

        # The certification code is the first run of four digits in the handler.
        code_match = re.search(r'\d\d\d\d', onclick)
        if code_match is None:
            continue
        number = code_match.group()

        detail_url = cert_basic_url + number
        detail_html = urllib2.urlopen(detail_url).read().decode('cp949', 'ignore')
        detail_soup = BeautifulSoup(detail_html)

        # NOTE(review): the second argument is a set, not a dict. bs4 treats a
        # non-dict `attrs` as a CSS-class filter, so this presumably matches an
        # <h2> carrying any of these class names -- confirm the intent.
        headings = detail_soup.find_all("h2", {"class", "tit", "bgNone", "pLNo"})
        if not headings:
            continue
        title = headings[0].get_text()

        # A title of exactly one space is treated as an invalid page.
        if title == " ":
            continue

        # Description: first <p> of the review tab, or the literal "None"
        # when the tab or the paragraph is missing.
        desc_tab = detail_soup.find(id="tabReview01")
        title_desc_soup = desc_tab.find("p") if desc_tab is not None else None
        title_desc = "None"
        if title_desc_soup is not None:
            title_desc = title_desc_soup.text.strip()

        # Exam info: a table flattened into <td> cells, header_count per row.
        exam_tab = detail_soup.find(id="tabReview02")
        if exam_tab is None:
            continue
        header_count = len(exam_tab.find_all("th"))
        if header_count == 0:
            # No header cells: the row width is unknown (and would divide by 0).
            continue
        table_body = exam_tab.find_all("td")

        # The last row is deliberately left out, as in the original "- 1".
        row_count = len(table_body) // header_count - 1
        for row in range(row_count):
            base = row * header_count
            if base + 4 >= len(table_body):
                # Narrow tables do not have columns 1..4 for this row.
                break
            cells = [table_body[base + offset].text.strip()
                     for offset in range(1, 5)]
            # Output format is kept exactly as before, including the
            # ", " separators before the last two fields.
            text = ("\"" + title.strip() + "\","
                    + "\"" + title_desc + "\","
                    + "\"" + cells[0] + "\","
                    + "\"" + cells[1] + "\", "
                    + "\"" + cells[2] + "\", "
                    + "\"" + cells[3] + "\"\n")
            # NOTE: 'mbcs' is the Windows ANSI code page codec and exists only
            # on Windows; encode once and reuse for both console and file.
            encoded = text.encode('mbcs')
            print(encoded)
            f.write(encoded)
	# -*- coding: cp949 -*-
	from bs4 import BeautifulSoup
	import urllib2
	import re

	# print soup.original_encoding

	# Fetch the certification listing page and collect every anchor whose href
	# is "#tmp"; the ones that open a detail popup are picked out below by their
	# onclick handler.
	url = "http://www.hrd.go.kr/jsp/HRDP/HRDP300/HRDP310/HRDP311/HRDP311_2List.jsp?keco_cd=&keco_nm=&orderField=jm_nm&orderDir=asc&pageNo=1&jm_cd=&kecoM=&kecoS=&srchKey=&rowsPerPage=1000"
	contents = urllib2.urlopen(url).read().decode('cp949', 'ignore')
	soup = BeautifulSoup(contents)
	all_link = soup.find_all("a", attrs={"href" : "#tmp"})

	# Base URL of the per-certification detail popup; the 4-digit code is appended.
	cert_basic_url = "http://www.hrd.go.kr/jsp/HRDP/popup/popLicenseView.jsp?jm_cd="

	# `with` closes cert.csv even when a request or a parsing step raises.
	with open("cert.csv", "w") as f:
		for link in all_link:
			# Work on the attribute value directly (no str()) so a non-ASCII
			# handler cannot raise UnicodeEncodeError; a missing attribute
			# becomes "" and is skipped by the prefix test.
			onclick = link.get('onclick') or ''
			if not onclick.startswith('viewLicDetail'):
				continue

			# The certification code is the first run of four digits in the handler.
			code_match = re.search(r'\d\d\d\d', onclick)
			if code_match is None:
				continue
			number = code_match.group()

			detail_url = cert_basic_url + number
			detail_html = urllib2.urlopen(detail_url).read().decode('cp949', 'ignore')
			detail_soup = BeautifulSoup(detail_html)

			# NOTE(review): the second argument is a set, not a dict. bs4 treats a
			# non-dict `attrs` as a CSS-class filter, so this presumably matches an
			# <h2> carrying any of these class names -- confirm the intent.
			headings = detail_soup.find_all("h2", {"class", "tit", "bgNone", "pLNo"})
			if not headings:
				continue
			title = headings[0].get_text()

			# A title of exactly one space is treated as an invalid page.
			if title == " ":
				continue

			# Description: first <p> of the review tab, or the literal "None"
			# when the tab or the paragraph is missing.
			desc_tab = detail_soup.find(id="tabReview01")
			title_desc_soup = desc_tab.find("p") if desc_tab is not None else None
			title_desc = "None"
			if title_desc_soup is not None:
				title_desc = title_desc_soup.text.strip()

			# Exam info: a table flattened into <td> cells, header_count per row.
			exam_tab = detail_soup.find(id="tabReview02")
			if exam_tab is None:
				continue
			header_count = len(exam_tab.find_all("th"))
			if header_count == 0:
				# No header cells: the row width is unknown (and would divide by 0).
				continue
			table_body = exam_tab.find_all("td")

			# The last row is deliberately left out, as in the original "- 1".
			row_count = len(table_body) // header_count - 1
			for row in range(row_count):
				base = row * header_count
				if base + 4 >= len(table_body):
					# Narrow tables do not have columns 1..4 for this row.
					break
				cells = [table_body[base + offset].text.strip()
					for offset in range(1, 5)]
				# Output format is kept exactly as before, including the
				# ", " separators before the last two fields.
				text = ("\"" + title.strip() + "\","
					+ "\"" + title_desc + "\","
					+ "\"" + cells[0] + "\","
					+ "\"" + cells[1] + "\", "
					+ "\"" + cells[2] + "\", "
					+ "\"" + cells[3] + "\"\n")
				# NOTE: 'mbcs' is the Windows ANSI code page codec and exists only
				# on Windows; encode once and reuse for both console and file.
				encoded = text.encode('mbcs')
				print(encoded)
				f.write(encoded)