Skip to content

Instantly share code, notes, and snippets.

@jrraymond
Last active August 29, 2015 14:02
Show Gist options
  • Save jrraymond/e4a95147a10e10869ad8 to your computer and use it in GitHub Desktop.
Save jrraymond/e4a95147a10e10869ad8 to your computer and use it in GitHub Desktop.
wescraper
import time, string
from scrapy.spider import BaseSpider
from scrapy.http import FormRequest
from scrapy.item import Item, Field
from scrapy.selector import HtmlXPathSelector
from selenium import webdriver
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException
# Credentials for the Wesleyan SSO login page. Both must be filled in before
# the spider is run: they are typed into the "username" and "password" form
# fields at the start of the crawl.
# NOTE(review): do not commit real credentials to version control.
WESLEYAN_USERNAME = ''
WESLEYAN_PASSWORD = ''
class Student(Item):
    """Scrapy item holding one student row scraped from the directory table."""

    # Family name and given name, in the order the directory lists them.
    lastName = Field()
    firstName = Field()

    # Contact address taken from the row's link cell.
    email = Field()

    # Value of the row's "year" cell, stored as scraped (not parsed).
    year = Field()
class WesSpider(BaseSpider):
    """Scrape the Wesleyan student directory through a Selenium-driven Firefox.

    Scrapy only supplies the start URL; all navigation (SSO login, searching,
    paging) is done with the Selenium driver because the directory is an
    interactive web application.  The spider searches for every two-letter
    lowercase prefix ("aa" .. "zz") and yields one ``Student`` item per
    result row.

    NOTE(review): the indentation of this file was lost before review, so the
    loop nesting below is reconstructed from statement order (search ->
    up to 9 result pages -> up to 50 rows per page, then one link click per
    search, and a single ``driver.quit()`` at the very end).  Confirm against
    the author's original layout.
    """

    name = "wes"
    allowed_domains = ["wesleyan.edu"]
    start_urls = ["https://sso.wesleyan.edu/login?service=https://wesep.wesleyan.edu/cgi-perl/session.cgi"]

    # XPath of the <tbody> that contains both the result rows and, in its
    # 52nd row, the pagination links.
    _RESULTS_TBODY_XPATH = '//*[@id="R160725883139767722"]/tbody/tr[2]/td[2]/table[4]/tbody'
    # Link clicked right after typing into the student search box
    # (presumably the "search" button -- TODO confirm).
    _SEARCH_LINK_XPATH = '//*[@id="apex_layout_160715889402661227"]/tbody/tr/td[6]/table/tbody/tr/td[2]/a'
    # Link clicked once all pages of a search have been read
    # (presumably resets the search form -- TODO confirm).
    _AFTER_SEARCH_LINK_XPATH = '//*[@id="wwvFlowForm"]/table[2]/tbody/tr[3]/td/a'
    # Item field name -> column index of the table cell it is read from.
    _STUDENT_COLUMNS = (('lastName', 3), ('firstName', 4), ('email', 6), ('year', 7))

    def __init__(self, *args, **kwargs):
        """Initialise the base spider, then start the Firefox driver.

        Arguments are forwarded untouched to the base class so that spider
        arguments passed by Scrapy are accepted instead of raising TypeError.
        """
        super(WesSpider, self).__init__(*args, **kwargs)
        self.driver = webdriver.Firefox()

    def parse(self, response):
        """Log in, run every two-letter search and yield the students found.

        Args:
            response: the Scrapy response for the start URL; only its ``url``
                is used, to point the Selenium driver at the login page.

        Yields:
            Student: one item per populated result row.
        """
        # try/finally guarantees the browser is closed even when an element
        # lookup fails or the consumer stops iterating early.
        try:
            self._login(response.url)
            # The original referenced an undefined name ``USERNAME`` here,
            # which raised NameError; the module-level constant is meant.
            self.driver.get("https://iasext.wesleyan.edu/directory_int/f?p=146:17:::::APEX_STRING:" + WESLEYAN_USERNAME + ":")
            for c in string.ascii_lowercase:
                for c2 in string.ascii_lowercase:
                    # NOTE: to search faculty instead, the original author
                    # used the field id 'P17_REPORT_SEARCH' and the link
                    # '//*[@id="apex_layout_171559622667204771"]/tbody/tr[2]/td[4]/table/tbody/tr/td[2]/a'.
                    self.driver.find_element_by_id('P17_STUDENT_REPORT_SEARCH').send_keys(c + c2)  # for students
                    self.driver.find_element_by_xpath(self._SEARCH_LINK_XPATH).click()
                    for student in self._scrape_result_pages():
                        yield student
                    self.driver.find_element_by_xpath(self._AFTER_SEARCH_LINK_XPATH).click()
        finally:
            self.driver.quit()

    def _login(self, login_url):
        """Open the SSO login page and submit the module-level credentials."""
        self.driver.get(login_url)
        self.driver.find_element_by_id("username").send_keys(WESLEYAN_USERNAME)
        self.driver.find_element_by_id("password").send_keys(WESLEYAN_PASSWORD)
        self.driver.find_element_by_class_name("btn-submit").click()

    def _scrape_result_pages(self):
        """Yield the students on the current result page and following pages."""
        for p in range(0, 9):
            # The link to follow is the (p+1)-th anchor of the pagination cell.
            nextPage = self._RESULTS_TBODY_XPATH + '/tr[52]/td/table/tbody/tr/td[3]/a[' + str(p + 1) + ']'
            # Table rows 2..51 hold the (up to 50) results of one page.
            for r in range(2, 52):
                if not self.existsXpath(self.getCellXpath(r, 3)):
                    break  # first missing row marks the end of this page
                student = self._read_student(r)
                # Parenthesised form prints identically on Python 2 and 3.
                print(student)
                yield student
            if self.existsXpath(nextPage):
                self.driver.find_element_by_xpath(nextPage).click()
            elif p > 0:
                break
            # NOTE(review): when p == 0 and there is no pagination link the
            # loop deliberately keeps the original behaviour of running one
            # more iteration, which appears to re-read the same page --
            # confirm whether that is intended.

    def _read_student(self, row):
        """Build a ``Student`` from the innerHTML of the cells of table row ``row``."""
        student = Student()
        for field, col in self._STUDENT_COLUMNS:
            cell = self.driver.find_element_by_xpath(self.getCellXpath(row, col))
            student[field] = cell.get_attribute('innerHTML')
        return student

    def getCellXpath(self, row, col):
        """Return the XPath of the content element in cell (``row``, ``col``).

        Column 6 wraps its content in an ``<a>`` element; every other column
        wraps it in a ``<span>``.
        """
        cell = self._RESULTS_TBODY_XPATH + '/tr[' + str(row) + ']/td[' + str(col) + ']'
        return cell + ('/a' if col == 6 else '/span')

    def existsXpath(self, xpath):
        """Return True if ``xpath`` matches an element on the current page."""
        try:
            self.driver.find_element_by_xpath(xpath)
        except NoSuchElementException:
            return False
        return True
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment