Created
November 10, 2014 09:27
-
-
Save AlexDel/67c993556e897039c871 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from scrapy.xlib.pydispatch import dispatcher | |
from scrapy import signals | |
from scrapy.http import FormRequest | |
from scrapy.selector import Selector | |
from spider.spiders.basic import StudentSpider | |
from spider.items import StudentItem | |
from scrapy import log | |
class XPATHS:
    """XPath expressions used to pull student data out of a results page.

    ``STUDENTS_ROWS`` is evaluated against the whole document; the remaining
    expressions are relative (they start with ``./``) and are evaluated
    against each node matched by ``STUDENTS_ROWS``.
    """

    # NOTE(review): ``tbody`` is frequently inserted by browsers and may be
    # absent from the raw HTML the crawler receives -- confirm against a real
    # response. ``tr[3]`` matches only the third row of its parent; verify
    # that this is the intended selection.
    STUDENTS_ROWS = '//*[@id="results"]/center/table/tbody/tr[3]'

    # Each field is read from the ``value`` attribute of a form input.
    STUDENT_NAME = './td/form/input[@name="submit"]/@value'
    STUDENT_EMAIL = './td/form/input[@id="email"]/@value'
    STUDENT_STATUS = './td/form/input[@id="rolestring"]/@value'
class SamSpider(StudentSpider):
    """Spider for the online directory at www2.samford.edu.

    The start page is fetched once; its search form is then submitted once
    per search phrase, and every results page is parsed into ``StudentItem``
    objects. Progress is tracked in ``self.state`` under the keys
    ``progress_current`` and ``progress_total``.

    NOTE(review): ``self.state`` and ``get_search_phrases`` are not defined
    in this class; presumably they come from ``StudentSpider`` or from
    Scrapy's spider-state extension -- confirm.
    """

    name = 'sam'
    start_urls = ['https://www2.samford.edu/onlineDir/index.php']

    def __init__(self, *args, **kwargs):
        """Initialise the spider and subscribe to the ``spider_idle`` signal.

        The optional ``filter_role`` keyword argument (default ``''``) is
        stored on the instance; it is not read anywhere else in this class.
        """
        super(SamSpider, self).__init__(*args, **kwargs)
        self.filter_role = kwargs.get('filter_role', '')
        dispatcher.connect(self.idle, signals.spider_idle)

    def idle(self, spider):
        """Mark progress as complete once this spider has nothing left to do.

        The handler is connected without a sender filter, so it can be
        invoked for any spider running in the same process; signals that
        belong to a different spider are ignored.
        """
        if spider is not self:
            return
        if "progress_total" in self.state:
            self.state['progress_current'] = self.state['progress_total']

    def parse(self, response):
        """Reset the progress counters and issue one search per phrase."""
        phrases = self.get_search_phrases()
        self.state['progress_current'] = 0
        self.state['progress_total'] = len(phrases)
        for phrase in phrases:
            yield self.get_search_request(response, phrase)

    def get_search_request(self, response, phrase):
        """Build the form submission that searches the directory for *phrase*.

        The form is located in *response* by ``//*[@id="search"]`` and its
        ``searchChar`` field is set to *phrase*; the resulting page is
        handled by :meth:`people`.
        """
        return FormRequest.from_response(
            response,
            formxpath='//*[@id="search"]',
            formdata={
                'searchChar': phrase,
            },
            callback=self.people,
        )

    @staticmethod
    def _first_stripped(values, default=''):
        """Return the first entry of *values* stripped, or *default* if empty."""
        return values[0].strip() if values else default

    def people(self, response):
        """Parse one results page and yield a ``StudentItem`` per matched row.

        Every call counts as one finished search, so ``progress_current`` is
        incremented before parsing. Fields missing from a row are emitted as
        empty strings.
        """
        self.state['progress_current'] += 1
        sel = Selector(response)
        for student in sel.xpath(XPATHS.STUDENTS_ROWS):
            yield StudentItem(
                name=self._first_stripped(
                    student.xpath(XPATHS.STUDENT_NAME).extract()),
                email=self._first_stripped(
                    student.xpath(XPATHS.STUDENT_EMAIL).extract()),
                enrollment=self._first_stripped(
                    student.xpath(XPATHS.STUDENT_STATUS).extract()),
            )
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment