Skip to content

Instantly share code, notes, and snippets.

@meisheep
Created May 12, 2017 13:16
Show Gist options
  • Save meisheep/ed5585f05422050f170e5ad6346facc8 to your computer and use it in GitHub Desktop.
Save meisheep/ed5585f05422050f170e5ad6346facc8 to your computer and use it in GitHub Desktop.
Scrape the UDN library using Selenium and Python 3
import sys

from selenium import webdriver
from selenium.webdriver import ActionChains
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import Select
# Entry page of the UDN library search site.
SEARCH_URL = 'http://udndata.com/library/'


def parse_keyword(argv):
    """Return the search keyword taken from the command line.

    Args:
        argv: Argument list laid out like ``sys.argv`` (program name first).

    Returns:
        ``argv[1]``; any further arguments are ignored.

    Raises:
        SystemExit: With status 1, after printing a usage line, when no
            keyword was supplied.
    """
    if len(argv) > 1:
        return argv[1]
    print('Usage: python3 udn.py <keyword>')
    sys.exit(1)


def submit_search(driver, keyword):
    """Open the search page, fill in the search form and submit it.

    Args:
        driver: Selenium WebDriver used to drive the browser.
        keyword: Text typed into the ``SearchString`` field.
    """
    driver.get(SEARCH_URL)
    # The form elements are looked up after switching into the frame matched
    # by `frame:last-child`.
    driver.switch_to.frame(
        driver.find_element(By.CSS_SELECTOR, 'frame:last-child'))
    driver.find_element(By.ID, 'SearchString').send_keys(keyword)

    # NOTE(review): the meaning of the option values and of the d2/d6/d9
    # inputs is defined by the remote page and is not visible here --
    # presumably `sharepage` = 50 is the number of results per page; confirm
    # against the live form.
    Select(driver.find_element(By.ID, 's1')).select_by_value('-1')
    for selector in ('input[name=d2]', 'input[name=d6]', 'input[name=d9]'):
        driver.find_element(By.CSS_SELECTOR, selector).click()
    Select(driver.find_element(
        By.CSS_SELECTOR, 'select[name=sharepage]')).select_by_value('50')
    Select(driver.find_element(
        By.CSS_SELECTOR, 'select[name=select]')).select_by_value('0')
    driver.find_element(By.CSS_SELECTOR, 'input[name=submit]').click()


def collect_links(driver):
    """Walk every result page and return the article URLs found on them.

    Args:
        driver: Selenium WebDriver currently showing the first result page.

    Returns:
        List of ``href`` values, in the order the pages were visited.
    """
    links = []
    while True:
        anchors = driver.find_elements(
            By.XPATH, "//a[@name]//td[@class='title02']/a")
        links.extend(anchor.get_attribute('href') for anchor in anchors)

        # "下一頁" is the "next page" link text; no such link means this was
        # the last result page.
        next_page = driver.find_elements(
            By.XPATH, '//a[contains(text(), "下一頁")]')
        if not next_page:
            return links
        print('Links#:', len(links))
        next_page[0].click()


def format_post(title, content):
    """Return one article formatted as a record of the output file."""
    return '{}\n\n{}\n\n-\n\n'.format(title, content)


def save_posts(driver, links, path):
    """Visit each link and append the article's title and body to *path*.

    Args:
        driver: Selenium WebDriver used to load the article pages.
        links: Iterable of article URLs.
        path: Output file; it is created or truncated.
    """
    # The articles contain Chinese text, so the encoding is pinned to UTF-8
    # instead of relying on the platform default; `with` guarantees the file
    # is flushed and closed even if a page fails to load.
    with open(path, 'w', encoding='utf-8') as out:
        for idx, link in enumerate(links):
            print('Getting #{}'.format(idx))
            driver.get(link)
            title = ' - '.join(
                elm.text.strip()
                for elm in driver.find_elements(By.CLASS_NAME, 'story_title'))
            content = driver.find_element(By.CLASS_NAME, 'story').text.strip()
            out.write(format_post(title, content))


def main(argv=None):
    """Search the site for a keyword and dump every hit to ``<keyword>.txt``.

    Args:
        argv: Argument list laid out like ``sys.argv``; defaults to
            ``sys.argv`` itself.
    """
    keyword = parse_keyword(sys.argv if argv is None else argv)
    driver = webdriver.Chrome()
    try:
        submit_search(driver, keyword)
        links = collect_links(driver)
        save_posts(driver, links, '{}.txt'.format(keyword))
    finally:
        # Always shut the browser down so no Chrome/chromedriver process is
        # left behind, including when scraping fails part-way.
        driver.quit()


if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment