Skip to content

Instantly share code, notes, and snippets.

@datalater
Created December 4, 2017 11:43
Show Gist options
  • Save datalater/eb70a4f141e95e296309fb15d64cadd4 to your computer and use it in GitHub Desktop.
crawling-for-ohak-ver01.py
import time
from selenium import webdriver
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
import getpass
from bs4 import BeautifulSoup
# Step 1: open the open.go.kr theme page, click the "복지" (welfare) tab,
# then read the total result count to derive how many pages to crawl.
print("(1/4) URL에 접속합니다...")
page_url = 'https://www.open.go.kr/search/theme/theme.do?themecd=00034'
driver = webdriver.Chrome(r'chromedriver.exe')
driver.get(page_url)

# Click the "복지" category tab (second <li> in the theme menu).
bokji = driver.find_element_by_xpath("//*[@id='content']/div[2]/div[1]/div/ul/li[2]/a")
ActionChains(driver).click(bokji).perform()
print("(2/4) 복지를 클릭합니다...")
time.sleep(2.0)

# Parse the refreshed page to read the total number of documents.
html = driver.page_source
soup = BeautifulSoup(html, 'lxml')
total_document = soup.select("#result_total_impl strong")[0].get_text()
total_document = int(total_document.replace(",", ""))
print("전체 문서의 개수는 %d개입니다." % (total_document))

# 10 results per page; round UP so a trailing partial page is not skipped.
# BUG FIX: the original used floor division (total_document // 10), which
# silently dropped the last page whenever the count wasn't a multiple of 10.
total_pages = -(-total_document // 10)
print("전체 페이지 수는 %d페이지입니다." % (total_pages))

# Wait until the result table is clickable before iterating its rows.
delay = 10
wait = WebDriverWait(driver, delay)
element = wait.until(EC.element_to_be_clickable((By.ID, 'result_false')))
print("5초 딜레이를 시작합니다....")
time.sleep(5.0)
# Step 2: walk every result page; for each row open the document view,
# trigger the attachment download, then navigate back to the result list.
# (Indentation reconstructed — the pasted source had its loop bodies
# flattened to column 0 and was not runnable as-is.)
for page in range(total_pages):
    # Count the rows actually present — the last page may hold fewer than 10.
    rows = driver.find_elements_by_xpath("//*[@id='result_false']/tr")
    for i in range(len(rows)):
        time.sleep(5.0)
        # Open the (i+1)-th document link of the current result page.
        wonmun_xpath = "//*[@id='result_false']/tr[" + str(i + 1) + "]/td[1]/a"
        wonmun_before = driver.find_element_by_xpath(wonmun_xpath)
        ActionChains(driver).click(wonmun_before).perform()
        print("2초 딜레이를 시작합니다...")
        time.sleep(2.0)

        ################################
        # beautiful soup change        #
        ################################
        # NOTE(review): mutating the parsed soup below does NOT affect the
        # live page — BeautifulSoup works on a detached copy of the HTML.
        # Kept only for its debug output; the actual download is triggered
        # by the Selenium click further down.
        html = driver.page_source
        soup = BeautifulSoup(html, 'lxml')
        a = soup.find_all("a", attrs={"class": "btn_icoStyle"})
        c = a[0]['onclick']
        print(type(a[0]['onclick']))
        c = c.replace("Y", "N")
        a[0]['onclick'] = c
        print(a[0]['onclick'])
        print(type(a[0]))

        # Click the attachment link to start the download.
        wonmun_down = driver.find_element_by_xpath('//*[@id="infoContent"]/tbody/tr[8]/td/p[1]/a')
        ActionChains(driver).click(wonmun_down).perform()
        print("(4/4) 다운로드 되었습니다...")
        print("15초 딜레이를 시작합니다...")
        time.sleep(15.0)

        # Return to the result list for the next row.
        wonmun_back = driver.find_element_by_xpath('//*[@id="btnArea"]/span/a')
        ActionChains(driver).click(wonmun_back).perform()
        time.sleep(2.0)

    # Advance to the next result page. BUG FIX: the original built
    # page_xpath but then located the button with a hard-coded '/a[2]'
    # xpath and never clicked it, so the crawler stayed on page 1 forever.
    if page + 1 < total_pages:
        page_xpath = '//*[@id="navi_page_false"]/a[' + str(page + 2) + ']'
        page_button = driver.find_element_by_xpath(page_xpath)
        ActionChains(driver).click(page_button).perform()
        time.sleep(2.0)
        print("현재 페이지는 %d페이지입니다" % (page + 2))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment