Created
December 4, 2017 11:43
-
-
Save datalater/eb70a4f141e95e296309fb15d64cadd4 to your computer and use it in GitHub Desktop.
crawling-for-ohak-ver01.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import time | |
from selenium import webdriver | |
from selenium.webdriver.common.action_chains import ActionChains | |
from selenium.webdriver.common.keys import Keys | |
from selenium.webdriver.support import expected_conditions as EC | |
from selenium.webdriver.common.by import By | |
from selenium.webdriver.support.ui import WebDriverWait | |
import getpass | |
from bs4 import BeautifulSoup | |
# --- Step 1: open the theme search page in Chrome -------------------------
print("(1/4) URL에 접속합니다...")
page_url = 'https://www.open.go.kr/search/theme/theme.do?themecd=00034'
driver = webdriver.Chrome(r'chromedriver.exe')
driver.get(page_url)

# --- Step 2: click the second category tab --------------------------------
# find_element(By.XPATH, ...) is used instead of find_element_by_xpath so the
# lookup works on both Selenium 3 and Selenium 4.
bokji = driver.find_element(By.XPATH, "//*[@id='content']/div[2]/div[1]/div/ul/li[2]/a")
ActionChains(driver).click(bokji).perform()
print("(2/4) 복지를 클릭합니다...")
time.sleep(2.0)  # give the result list time to refresh before scraping it

html = driver.page_source
soup = BeautifulSoup(html, 'lxml')

# Total number of documents: the counter text may contain thousands
# separators (e.g. "1,234"), which are stripped before conversion.
total_document = soup.select("#result_total_impl strong")[0].get_text()
total_document = total_document.replace(",", "")
total_document = int(total_document)
print("전체 문서의 개수는 %d개입니다." % (total_document))

# Total number of pages at 10 documents per page. Ceiling division is used so
# that a final, partially filled page is counted too (25 documents -> 3 pages,
# not 2 as plain floor division would give).
total_pages = (total_document + 9) // 10
print("전체 페이지 수는 %d페이지입니다." % (total_pages))

# Block (up to `delay` seconds) until the result table is clickable.
delay = 10
wait = WebDriverWait(driver, delay)
element = wait.until(EC.element_to_be_clickable((By.ID, 'result_false')))
print("5초 딜레이를 시작합니다....")
time.sleep(5.0)
# Crawl every result page: open each listed document, click its download
# link, then return to the result list. The browser is always closed at the
# end, even when a lookup fails part-way through.
ROWS_PER_PAGE = 10  # the loop visits at most this many rows on each page

try:
    for page in range(total_pages):
        for row in range(1, ROWS_PER_PAGE + 1):
            time.sleep(5.0)
            wonmun_xpath = "//*[@id='result_false']/tr[" + str(row) + "]/td[1]/a"
            # find_elements returns an empty list instead of raising, so a
            # page with fewer than ROWS_PER_PAGE rows ends the row loop cleanly.
            wonmun_links = driver.find_elements(By.XPATH, wonmun_xpath)
            if not wonmun_links:
                break
            ActionChains(driver).click(wonmun_links[0]).perform()
            print("2초 딜레이를 시작합니다...")
            time.sleep(2.0)

            ################################
            #   BeautifulSoup change       #
            ################################
            # NOTE(review): this rewrites the onclick attribute on a parsed
            # *copy* of the page only; the live DOM in the browser is not
            # modified. Kept as-is (including the debug prints) — confirm
            # whether the intent was to patch the page via
            # driver.execute_script instead.
            html = driver.page_source
            soup = BeautifulSoup(html, 'lxml')
            a = soup.find_all("a", attrs={"class": "btn_icoStyle"})
            c = a[0]['onclick']
            print(type(a[0]['onclick']))
            c = c.replace("Y", "N")
            a[0]['onclick'] = c
            print(a[0]['onclick'])
            print(type(a[0]))

            # Click the download link on the document detail page.
            wonmun_down = driver.find_element(By.XPATH, '//*[@id="infoContent"]/tbody/tr[8]/td/p[1]/a')
            ActionChains(driver).click(wonmun_down).perform()
            print("(4/4) 다운로드 되었습니다...")
            print("15초 딜레이를 시작합니다...")
            time.sleep(15.0)  # leave time for the download to finish

            # Return to the result list.
            wonmun_back = driver.find_element(By.XPATH, '//*[@id="btnArea"]/span/a')
            ActionChains(driver).click(wonmun_back).perform()
            time.sleep(2.0)

        # Nothing to navigate to after the final page.
        if page + 1 >= total_pages:
            break

        # Move to the next result page. Previously the button was looked up
        # with a hard-coded index and never clicked, so the crawl stayed on
        # the first page for every iteration.
        # NOTE(review): assumes the pager's a[page + 2] anchor opens page
        # number page + 2 — confirm against the live pager markup.
        page_xpath = '//*[@id="navi_page_false"]/a[' + str(page + 2) + ']'
        page_button = driver.find_element(By.XPATH, page_xpath)
        ActionChains(driver).click(page_button).perform()
        print("현재 페이지는 %d페이지입니다" % (page + 2))
finally:
    # Close the browser and stop the chromedriver process.
    driver.quit()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment