Created
June 25, 2016 00:25
-
-
Save domspad/2aa14c3686589bbf5ea2e3c156afa8d6 to your computer and use it in GitHub Desktop.
A script using Selenium to automate the scraping of discussion threads on Project Euler
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python
"""
A script to automate the scraping of Project Euler (www.projecteuler.net) discussion threads.
Requires the username and password of an account that has solved the problems whose threads are to be scraped.
"""
import contextlib

import selenium.webdriver as webdriver
import selenium.webdriver.support.ui as ui
from selenium.webdriver.common.by import By
def save_html(html, fname):
    """Write *html* to the file *fname* as UTF-8 encoded bytes.

    Args:
        html: Text of the page (``unicode`` on Python 2, ``str`` on Python 3).
        fname: Path of the file to create or overwrite.
    """
    # The payload is encoded before writing, so the file must be opened in
    # binary mode: a text-mode handle rejects bytes on Python 3 and applies
    # newline translation on Windows.
    with open(fname, 'wb') as f:
        f.write(html.encode('utf-8'))
# Credentials of the Project Euler account used to sign in.
# Fill both in before running the script.
username = ''  # <USERNAME HERE>
password = ''  # <PASSWORD HERE>
if not username or not password:
    raise SystemExit('Set username and password in the script before running it.')

# raw_input only exists on Python 2; fall back to input on Python 3.
try:
    read_line = raw_input
except NameError:
    read_line = input

# Inclusive range of problem numbers whose threads are downloaded.
FIRST_PROBLEM = 87
LAST_PROBLEM = 100
# FIXME - hardcoded num of thread pages; a thread with fewer pages makes the
# page-link lookup below time out and abort the run.
THREAD_PAGES = 8


def _wait_for(driver, by, value, timeout):
    """Return the element located by (by, value), polling up to *timeout* seconds.

    WebDriverWait.until retries while the lookup raises
    NoSuchElementException and raises TimeoutException once *timeout*
    seconds have elapsed without a match.
    """
    return ui.WebDriverWait(driver, timeout).until(
        lambda d: d.find_element(by, value))


driver = webdriver.Chrome()
try:
    # sign in
    driver.get('https://www.projecteuler.net/sign_in')
    username_field = _wait_for(driver, By.NAME, 'username', 10)

    # manually enter the captcha: read the text shown in the browser window
    # and type it on stdin (no prompt is printed)
    captcha = read_line()

    username_field.send_keys(username)
    driver.find_element(By.NAME, 'password').send_keys(password)
    driver.find_element(By.NAME, 'captcha').send_keys(captcha)
    driver.find_element(By.NAME, 'sign_in').click()

    # get threads!
    for prob_num in range(FIRST_PROBLEM, LAST_PROBLEM + 1):
        # Fourth entry of the navigation bar; presumably the page that lists
        # the solved problems as numbered links - TODO confirm.
        _wait_for(driver, By.XPATH, '//*[@id="nav"]/ul/li[4]/a', 10).click()

        # Open the problem, then follow its link to the discussion thread.
        _wait_for(driver, By.LINK_TEXT, str(prob_num), 5).click()
        _wait_for(driver, By.LINK_TEXT, 'problem {}'.format(prob_num), 3).click()

        for page_num in range(1, THREAD_PAGES + 1):
            _wait_for(driver, By.LINK_TEXT, str(page_num), 3).click()
            # page_source is already text; save_html does the encoding.
            html = driver.page_source
            print('{} {}'.format(prob_num, page_num))
            fname = '{}-{}.html'.format(prob_num, page_num)
            save_html(html, fname)
finally:
    # quit() ends the whole session (browser and driver process); close()
    # would only close the current window.
    driver.quit()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment