# Source: GitHub Gist AlexDel/67c993556e897039c871
# Author: @AlexDel
# Created: November 10, 2014 09:27
from scrapy.xlib.pydispatch import dispatcher
from scrapy import signals
from scrapy.http import FormRequest
from scrapy.selector import Selector
from spider.spiders.basic import StudentSpider
from spider.items import StudentItem
from scrapy import log
class XPATHS:
    """XPath expressions used to scrape the directory search results page."""

    # Absolute path to the result row(s) inside the element with id "results".
    # NOTE(review): `tr[3]` matches only the third <tr>, and the path relies
    # on a <tbody> element being present in the served HTML -- confirm both
    # against the live page.
    STUDENTS_ROWS = '//*[@id="results"]/center/table/tbody/tr[3]'

    # The expressions below are relative: they are evaluated against each
    # node selected by STUDENTS_ROWS.
    STUDENT_NAME = './td/form/input[@name="submit"]/@value'
    STUDENT_EMAIL = './td/form/input[@id="email"]/@value'
    STUDENT_STATUS = './td/form/input[@id="rolestring"]/@value'
class SamSpider(StudentSpider):
    """Spider that searches the Samford online directory and yields students.

    For every search phrase it submits the directory search form and turns
    the matching result rows into ``StudentItem`` objects. Progress is
    tracked in ``self.state`` under the keys ``progress_current`` and
    ``progress_total``.
    """

    name = 'sam'
    start_urls = ['https://www2.samford.edu/onlineDir/index.php']

    def __init__(self, *args, **kwargs):
        """Initialise the spider and subscribe to the ``spider_idle`` signal.

        Args:
            *args: Positional arguments forwarded to ``StudentSpider``.
            **kwargs: Keyword arguments forwarded to ``StudentSpider``; the
                optional ``filter_role`` entry is stored on the instance
                (defaults to an empty string).
        """
        super(SamSpider, self).__init__(*args, **kwargs)
        self.filter_role = kwargs.get('filter_role', '')
        # NOTE(review): connecting without a sender means this handler is
        # presumably invoked for the idle signal of any spider running in
        # the same process -- confirm only one spider runs per process.
        dispatcher.connect(self.idle, signals.spider_idle)

    def idle(self, spider):
        """Mark progress as complete once the spider has nothing left to do.

        Args:
            spider: The spider passed along with the ``spider_idle`` signal
                (not used by this handler).
        """
        if "progress_total" in self.state:
            self.state['progress_current'] = self.state['progress_total']

    def parse(self, response):
        """Issue one search request per search phrase.

        Resets the progress counters in ``self.state`` before scheduling
        the requests.

        Args:
            response: Response for the directory start page; it holds the
                search form that each request is built from.

        Yields:
            One form request per search phrase.
        """
        # get_search_phrases() is not defined in this class; presumably it
        # is inherited from StudentSpider.
        phrases = self.get_search_phrases()
        self.state['progress_current'] = 0
        self.state['progress_total'] = len(phrases)
        for phrase in phrases:
            yield self.get_search_request(response, phrase)

    def get_search_request(self, response, phrase):
        """Build the search-form submission for a single phrase.

        Args:
            response: Response containing the form with id ``search``.
            phrase: Value submitted in the ``searchChar`` form field.

        Returns:
            A ``FormRequest`` whose response is handled by ``people``.
        """
        return FormRequest.from_response(
            response,
            formxpath='//*[@id="search"]',
            formdata={
                'searchChar': phrase
            },
            callback=self.people
        )

    @staticmethod
    def _lget(values, index, default):
        """Return ``values[index]`` stripped, or ``default`` if out of range."""
        return values[index].strip() if len(values) > index else default

    def people(self, response):
        """Extract student items from a search results page.

        Increments ``progress_current`` once per handled results page.

        Args:
            response: Response returned for a submitted search form.

        Yields:
            A ``StudentItem`` for each matched result row; fields that are
            missing from the row are set to an empty string.
        """
        self.state['progress_current'] += 1
        sel = Selector(response)
        students = sel.xpath(XPATHS.STUDENTS_ROWS)
        for student in students:
            yield StudentItem(
                name=self._lget(
                    student.xpath(XPATHS.STUDENT_NAME).extract(), 0, ''),
                email=self._lget(
                    student.xpath(XPATHS.STUDENT_EMAIL).extract(), 0, ''),
                enrollment=self._lget(
                    student.xpath(XPATHS.STUDENT_STATUS).extract(), 0, '')
            )
# (GitHub page footer: "Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment")