from scrapy.xlib.pydispatch import dispatcher
from scrapy import signals
from scrapy.http import FormRequest
from scrapy.selector import Selector
from spider.spiders.basic import StudentSpider
from spider.items import StudentItem
from scrapy import log
class XPATHS:
    """XPath expressions used by ``SamSpider.people`` on a results page."""

    # Absolute expression selecting the row nodes that are iterated as
    # students.
    # NOTE(review): ``tr[3]`` matches only the third row of each table body,
    # not every row -- confirm this is intended.
    STUDENTS_ROWS = '//*[@id="results"]/center/table/tbody/tr[3]'

    # The expressions below are relative (``./``) and are evaluated against
    # each node matched by STUDENTS_ROWS; each reads the ``value`` attribute
    # of a form ``<input>``.
    STUDENT_NAME = './td/form/input[@name="submit"]/@value'
    STUDENT_EMAIL = './td/form/input[@id="email"]/@value'
    STUDENT_STATUS = './td/form/input[@id="rolestring"]/@value'
class SamSpider(StudentSpider):
    """Spider that submits one search form per phrase and scrapes students.

    ``parse`` issues one form submission per search phrase and ``people``
    turns each result page into ``StudentItem`` objects.  Progress is
    recorded in ``self.state`` under the ``progress_current`` and
    ``progress_total`` keys.
    """

    name = 'sam'
    # NOTE(review): the only start URL is an empty string -- presumably it is
    # filled in elsewhere or per deployment; confirm before running.
    start_urls = ['']

    def __init__(self, *args, **kwargs):
        """Initialise the spider and subscribe to the ``spider_idle`` signal.

        Args:
            *args: Forwarded unchanged to the parent constructor.
            **kwargs: Forwarded unchanged to the parent constructor.  An
                optional ``filter_role`` entry is also stored on the
                instance (defaults to ``''``); it is not used elsewhere in
                this class.
        """
        super(SamSpider, self).__init__(*args, **kwargs)
        self.filter_role = kwargs.get('filter_role', '')
        dispatcher.connect(self.idle, signals.spider_idle)

    def idle(self, spider):
        """Mark progress as complete when the idle signal fires.

        Does nothing when ``parse`` has not yet recorded a total.

        Args:
            spider: The spider passed by the signal; unused.
        """
        if "progress_total" in self.state:
            self.state['progress_current'] = self.state['progress_total']

    def parse(self, response):
        """Reset the progress counters and yield one search per phrase.

        Args:
            response: The page containing the search form.

        Yields:
            One form request per search phrase.
        """
        # get_search_phrases is not defined in this class; presumably it is
        # inherited from StudentSpider.
        phrases = self.get_search_phrases()
        self.state['progress_current'] = 0
        self.state['progress_total'] = len(phrases)
        for phrase in phrases:
            yield self.get_search_request(response, phrase)

    def get_search_request(self, response, phrase):
        """Build the form submission that searches for ``phrase``.

        Args:
            response: The page containing the form matched by
                ``//*[@id="search"]``.
            phrase: Value submitted in the ``searchChar`` form field.

        Returns:
            A ``FormRequest`` pre-filled from the form found in ``response``.
        """
        # NOTE(review): the callback is assumed to be ``self.people`` -- it
        # is the only response handler in this class that nothing else
        # references.  Confirm against the intended behaviour.
        return FormRequest.from_response(
            response,
            formxpath='//*[@id="search"]',
            formdata={'searchChar': phrase},
            callback=self.people
        )

    def people(self, response):
        """Extract one ``StudentItem`` per matched row of a result page.

        Increments ``progress_current`` once per handled response.

        Args:
            response: A search result page.

        Yields:
            ``StudentItem`` objects whose ``name``, ``email`` and
            ``enrollment`` fields fall back to ``''`` when the corresponding
            input is missing from the row.
        """
        def first_stripped(values, default=''):
            # First extracted value with surrounding whitespace removed, or
            # ``default`` when the XPath matched nothing.
            return values[0].strip() if values else default

        self.state['progress_current'] += 1
        sel = Selector(response)
        for student in sel.xpath(XPATHS.STUDENTS_ROWS):
            yield StudentItem(
                name=first_stripped(
                    student.xpath(XPATHS.STUDENT_NAME).extract()),
                email=first_stripped(
                    student.xpath(XPATHS.STUDENT_EMAIL).extract()),
                enrollment=first_stripped(
                    student.xpath(XPATHS.STUDENT_STATUS).extract())
            )
