Created
May 12, 2017 13:16
-
-
Save meisheep/ed5585f05422050f170e5ad6346facc8 to your computer and use it in GitHub Desktop.
Scrape the UDN library using Selenium and Python 3
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import sys | |
from selenium import webdriver | |
from selenium.webdriver import ActionChains | |
from selenium.webdriver.common.keys import Keys | |
from selenium.webdriver.support.ui import Select | |
# The search keyword is taken from the first command-line argument;
# without one, print the usage line and exit with status 1.
if len(sys.argv) <= 1:
    print('Usage: python3 udn.py <keyword>')
    sys.exit(1)
keyword = sys.argv[1]
# Start a Chrome session and create an action chain bound to it.
driver = webdriver.Chrome()
actions = ActionChains(driver)
url = 'http://udndata.com/library/'

# search
driver.get(url)

# The form controls are looked up after switching into the element matched
# by `frame:last-child`.
search_frame = driver.find_element_by_css_selector('frame:last-child')
driver.switch_to.frame(search_frame)

# Type the keyword and pick the '-1' option of the `s1` drop-down.
keyword_box = driver.find_element_by_id('SearchString')
keyword_box.send_keys(keyword)
Select(driver.find_element_by_id('s1')).select_by_value('-1')

# Click the d2, d6 and d9 inputs, in that order.
for selector in ('input[name=d2]', 'input[name=d6]', 'input[name=d9]'):
    driver.find_element_by_css_selector(selector).click()

# Choose the '50' option of `sharepage` and the '0' option of `select`.
# NOTE(review): '50' is presumably the number of results per page -- confirm.
sharepage_select = driver.find_element_by_css_selector('select[name=sharepage]')
Select(sharepage_select).select_by_value('50')
order_select = driver.find_element_by_css_selector('select[name=select]')
Select(order_select).select_by_value('0')

# Submit the search form.
driver.find_element_by_css_selector('input[name=submit]').click()
# get links of each page
links = []
while True:
    # Gather the href of every title anchor on the current result page.
    title_anchors = driver.find_elements_by_xpath("//a[@name]//td[@class='title02']/a")
    page_hrefs = [anchor.get_attribute('href') for anchor in title_anchors]
    links.extend(page_hrefs)

    # Stop once no anchor whose text contains "下一頁" ("next page") is found.
    next_page = driver.find_elements_by_xpath('//a[contains(text(), "下一頁")]')
    if not next_page:
        break

    # Report the running total, then follow the first matching anchor.
    print('Links#:', len(links))
    next_page[0].click()
# get content of each post
# Visit every collected link and append its title and body text to
# '<keyword>.txt', one record per post, separated by a '-' line.
try:
    # The context manager guarantees the file is flushed and closed even if a
    # page fails to load. The encoding is pinned to UTF-8 because the scraped
    # text is not ASCII and the platform default encoding may be unable to
    # represent it.
    with open('{}.txt'.format(keyword), 'w', encoding='utf-8') as f:
        for idx, link in enumerate(links):
            print('Getting #{}'.format(idx))
            driver.get(link)
            # All 'story_title' elements are joined with ' - ' to form the title.
            title_parts = driver.find_elements_by_class_name('story_title')
            title = ' - '.join(elm.text.strip() for elm in title_parts)
            content = driver.find_element_by_class_name('story').text.strip()
            f.write('{}\n\n{}\n\n-\n\n'.format(title, content))
finally:
    # Always shut the browser down so no Chrome/chromedriver process is left
    # running after the script ends or fails.
    driver.quit()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment