Skip to content

Instantly share code, notes, and snippets.

@domspad
Created June 25, 2016 00:25
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save domspad/2aa14c3686589bbf5ea2e3c156afa8d6 to your computer and use it in GitHub Desktop.
Save domspad/2aa14c3686589bbf5ea2e3c156afa8d6 to your computer and use it in GitHub Desktop.
A script using Selenium to automate the scraping of discussion threads on Project Euler
#!/usr/bin/env python
"""
A script to automate the scraping of Project Euler (www.projecteuler.net) discussion threads.
Requires a username and password associated with an account that has solved the problems associated with the desired threads.
"""
import contextlib
import selenium.webdriver as webdriver
import selenium.webdriver.support.ui as ui
def save_html(html, fname):
    """Write *html* to the file *fname* as UTF-8 encoded bytes.

    html  -- text of the page (a unicode string).
    fname -- path of the file to create; an existing file is overwritten.
    """
    # The payload is already encoded, so the file must be opened in binary
    # mode: a text-mode handle rejects bytes on Python 3 and may rewrite
    # newlines on Windows.
    with open(fname, 'wb') as f:
        f.write(html.encode('utf-8'))
# Project Euler credentials typed into the sign-in form.
# Replace both placeholders before running; a bare `name=` followed only by a
# comment is a syntax error, so the placeholders are kept as string literals.
# NOTE(review): avoid committing real credentials to version control.
username = '<USERNAME HERE>'
password = '<PASSWORD HERE>'
with contextlib.closing(webdriver.Chrome()) as driver:
    # A WebDriverWait does nothing until .until() is called on it, so a single
    # instance is created here and actually used for every lookup that follows
    # a navigation.  until() polls the callable and returns its result (the
    # element) once the lookup stops raising NoSuchElementException.
    wait = ui.WebDriverWait(driver, 10)  # timeout after 10 seconds

    # sign in
    driver.get('https://www.projecteuler.net/sign_in')
    usn_el = wait.until(lambda d: d.find_element_by_name('username'))

    # manually enter the captcha (read from stdin once the form is displayed)
    try:
        read_line = raw_input  # Python 2
    except NameError:
        read_line = input  # Python 3
    captcha = read_line('Captcha shown in the browser: ')

    usn_el.send_keys(username)
    driver.find_element_by_name('password').send_keys(password)
    driver.find_element_by_name('captcha').send_keys(captcha)
    driver.find_element_by_name('sign_in').click()

    # get threads!
    for prob_num in range(87, 101):
        # Fourth entry of the navigation bar, then the link whose text is the
        # problem number, then that problem's discussion thread link.
        wait.until(
            lambda d: d.find_element_by_xpath('//*[@id="nav"]/ul/li[4]/a')
        ).click()
        wait.until(
            lambda d: d.find_element_by_link_text(str(prob_num))
        ).click()
        wait.until(
            lambda d: d.find_element_by_link_text('problem {}'.format(prob_num))
        ).click()

        # Follow the numbered page links 1, 2, 3, ... until the next number
        # has no link, instead of assuming every thread has exactly 8 pages.
        page = 1
        while True:
            # find_elements_* returns an empty list (no exception) on no match.
            page_links = driver.find_elements_by_link_text(str(page))
            if page_links:
                page_links[0].click()
            elif page > 1:
                break  # the previously saved page was the last one
            # No link "1" at all: presumably a single-page thread that is
            # already on display, so it is saved as page 1 as-is.
            print('{} {}'.format(prob_num, page))
            fname = '{}-{}.html'.format(prob_num, page)
            save_html(driver.page_source, fname)
            page += 1
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment