Skip to content

Instantly share code, notes, and snippets.

@stav
Last active August 29, 2015 14:07
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save stav/0eabd0ead46c5900de50 to your computer and use it in GitHub Desktop.
Save stav/0eabd0ead46c5900de50 to your computer and use it in GitHub Desktop.
Scrapy spider with streamlined routing and Item Loader processing
"""
Routed Crawler
"""
class Route(dict):
    """Spider route request.

    A plain ``dict`` subclass that adds no behaviour of its own.  The
    routing spider in this module reads two keys from it:

    url     -- address to request; routes without a truthy ``url`` are
               skipped when the router is drained.
    execute -- optional callable invoked with the response.

    Any other keys are forwarded unchanged as keyword arguments to
    ``scrapy.Request``.
    """
class Router(scrapy.Spider):
    """Spider routing and loader handling.

    Holds the list of pending routes together with the single loader
    instance that every routed callback adds its values to.

    NOTE(review): ``scrapy.Spider.__init__`` is not called here, and no
    Spider feature is used by this class; the base class looks
    unnecessary -- confirm before changing it.
    """

    def __init__(self, loader=None):
        """Create an empty route list and instantiate the shared loader.

        loader -- loader class (or any zero-argument callable returning a
                  loader); defaults to Scrapy's ``ItemLoader``.
        """
        if loader is None:
            # Resolved lazily instead of in the signature: the previous
            # default, ``scrapy.items.ItemLoader``, was evaluated while the
            # class body was being defined and is not a real Scrapy path.
            try:
                from scrapy.loader import ItemLoader as loader
            except ImportError:  # Scrapy releases before 1.0
                from scrapy.contrib.loader import ItemLoader as loader
        self.routes = list()
        self.loader = loader()

    def add_route(self, **kwargs):
        """Append a new ``Route`` built from the given keyword arguments."""
        self.routes.append(Route(**kwargs))
class RouteSpider(scrapy.Spider):
    """Spider with request routing and loader handling.

    Subclasses implement ``start_routes`` to produce the initial routes.
    Each route is turned into a request whose response is handled by
    ``route``, which runs the route's ``execute`` callable and then either
    follows the next pending route or emits the loaded item.
    """

    def _route_request(self, route, router=None):
        """Build the ``scrapy.Request`` for a route.

        route  -- mapping of ``scrapy.Request`` keyword arguments, plus an
                  optional ``execute`` callable that is moved into the
                  request meta instead of being passed to the request.
        router -- router carried along in the request meta; a fresh
                  ``Router`` is created when omitted.
        """
        if router is None:
            router = Router()
        # Work on a copy so the caller's route mapping is not modified.
        request_kwargs = dict(route)
        meta = dict(router=router)
        if 'execute' in request_kwargs:
            meta['execute'] = request_kwargs.pop('execute')
        return scrapy.Request(callback=self.route, meta=meta, **request_kwargs)

    def parse(self, response):
        """Yield one routed request for every route from ``start_routes``."""
        for route in self.start_routes(response):
            yield self._route_request(route)

    def start_routes(self, response):
        """Return an iterable of initial routes; subclasses must override."""
        raise NotImplementedError

    def route(self, response):
        """Handle a routed response.

        Points the router's loader at the new response, runs the route's
        ``execute`` callable (if any), then yields either the request for
        the next pending route that has a URL or, when none is left, the
        loaded item.
        """
        router = response.meta.get('router')
        function = response.meta.get('execute')

        # Loader Context magic for new response
        loader = router.loader
        loader.selector = loader.default_selector_class(response)
        loader.context.update(selector=loader.selector, response=response)

        # Run the executable; ``execute`` is optional on a route, so a
        # missing one is skipped rather than called as ``None``.
        response.router = router
        if function is not None:
            function(response)

        # If we have more routes then pop them off the front of the list
        # until one with a URL is found, and follow that one.
        while router.routes:
            route = router.routes.pop(0)
            if route.get('url'):
                yield self._route_request(route, router=router)
                return

        # otherwise yield the loader item and we're done
        yield router.loader.load_item()
class MySpider(RouteSpider):
    """Example routed spider: centers list -> center -> contact -> media."""

    name = "example"
    start_urls = ['http://www.example.com/']

    @staticmethod
    def _first_url(response, selection):
        """Return the first value extracted from ``selection`` as an
        absolute URL, or ``None`` when the selection is empty.

        ``scrapy.Request`` needs a URL string, not a selector list.
        """
        values = selection.extract()
        if not values:
            return None
        return response.urljoin(values[0])

    def start_routes(self, response):
        """
        Parse the list of centers.
        """
        for city in response.css('#location .centers'):
            url = self._first_url(response, city.xpath('a/@href'))
            if url:
                yield Route(url=url, execute=self.parse_center)

    def parse_center(self, response):
        """
        Parse a center.
        """
        router = response.router

        # Add to loader
        router.loader.add_css('name', '#block-main .content-banner .module')
        router.loader.add_css('description', '#system')

        # Calc new routes
        # NOTE(review): assumes #contact and #media are the link elements
        # themselves (carrying the href) -- confirm against the real markup.
        conta_url = self._first_url(
            response, response.css('#contact').xpath('@href'))
        media_url = self._first_url(
            response, response.css('#media').xpath('@href'))

        # Add new routes; a route whose url is None is skipped when the
        # router is drained in RouteSpider.route.
        router.add_route(url=conta_url, execute=self.parse_center_contact)
        router.add_route(url=media_url, execute=self.parse_center_media)

    def parse_center_contact(self, response):
        """
        Parse a center contact page.
        """
        loader = response.router.loader

        # Add more to the same persistent loader
        email_re = r'\b[\w.%+-]+@[\w.-]+\.[A-Za-z]{2,4}\b'
        # '//p' searches every paragraph in the page; a bare 'p' is
        # evaluated relative to the document root and matches nothing.
        loader.add_xpath('email', '//p', re=email_re)

    def parse_center_media(self, response):
        """
        Parse a center media library page.
        """
        loader = response.router.loader

        # Add more to the loader: the src of every iframe on the page.
        # One document-wide query replaces a loop that re-ran '@src'
        # against the page root once per iframe.
        loader.add_xpath('videos', '//iframe/@src')

        # If this is the last route in the router, which in this example it is,
        # then after this method executes the router will yield its contents.
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment