Last active
August 29, 2015 14:07
-
-
Save stav/0eabd0ead46c5900de50 to your computer and use it in GitHub Desktop.
Scrapy spider with streamlined routing and Item Loader processing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
Routed Crawler | |
""" | |
class Route(dict):
    """Spider route request.

    A plain mapping describing one request to make: its entries are passed
    to ``scrapy.Request`` as keyword arguments, except for the optional
    ``execute`` entry, which names the callable to run on the response.
    """
class Router(scrapy.Spider):
    """Spider routing and loader handling.

    Holds the list of routes still to be requested and the single loader
    instance that the route callbacks add their values to.
    """

    # NOTE(review): this class subclasses scrapy.Spider but never runs the
    # base initializer -- confirm the base class is intentional.
    def __init__(self, loader=None):
        """Create a router with no pending routes.

        :param loader: zero-argument callable (normally an ``ItemLoader``
            class) used to build the loader instance.  When omitted,
            Scrapy's stock ``ItemLoader`` is used.
        """
        if loader is None:
            # Resolved lazily: a default of ``scrapy.items.ItemLoader`` is
            # evaluated when the class is defined, and that attribute path
            # does not exist in Scrapy, so the module failed to import.
            from scrapy.loader import ItemLoader as loader
        self.routes = list()
        self.loader = loader()

    def add_route(self, **kwargs):
        """Queue a new route built from the given keyword arguments."""
        self.routes.append(Route(**kwargs))
class RouteSpider(scrapy.Spider):
    """Spider with request routing and loader handling.

    Subclasses implement :meth:`start_routes`.  Every route yielded from it
    gets its own router; the routes later queued on that router are
    requested one after another, and once none is left the router's loader
    item is yielded.
    """

    def _route_request(self, route, router=None):
        """Build the request for *route*.

        :param route: mapping of ``scrapy.Request`` keyword arguments, plus
            an optional ``execute`` callable to run on the response.
        :param router: router to attach to the request; a fresh one is
            created when omitted.
        :returns: a ``scrapy.Request`` whose callback is :meth:`route` and
            whose ``meta`` carries the router and, if given, ``execute``.
        """
        if router is None:
            router = Router()
        # Work on a copy so the caller's route mapping is not modified.
        request_kwargs = dict(route)
        execute = request_kwargs.pop('execute', None)
        meta = dict(router=router)
        if execute is not None:
            meta['execute'] = execute
        return scrapy.Request(callback=self.route, meta=meta,
                              **request_kwargs)

    def parse(self, response):
        """Turn each start route into a request with its own router."""
        for route in self.start_routes(response):
            yield self._route_request(route)

    def start_routes(self, response):
        """Return an iterable of the initial routes; must be overridden."""
        raise NotImplementedError

    def route(self, response):
        """Run the route's callable, then follow up or emit the item.

        Yields the request for the next queued route that has a URL, or,
        when no such route is left, the item loaded by the router's loader.
        """
        router = response.meta.get('router')
        function = response.meta.get('execute')

        # Loader Context magic for new response
        loader = router.loader
        loader.selector = loader.default_selector_class(response)
        loader.context.update(selector=loader.selector, response=response)

        # Run the executable, if the route supplied one; calling a missing
        # ``execute`` would raise TypeError on None.
        response.router = router
        if function is not None:
            function(response)

        # If we have more routes then pop one off the stack and yield it,
        # skipping any route without a usable URL
        while router.routes:
            route = router.routes.pop(0)
            if route.get('url'):
                yield self._route_request(route, router=router)
                break

        # otherwise (loop ended without a break) yield the loader item and
        # we're done
        else:
            yield router.loader.load_item()
class MySpider(RouteSpider):
    """Example spider that builds each center's item from several pages."""

    name = "example"
    start_urls = ['http://www.example.com/']

    def start_routes(self, response):
        """
        Parse the list of centers.

        Yields one route per center link found on the start page.
        """
        for city in response.css('#location .centers'):
            # A request needs a URL string; xpath() alone returns a
            # SelectorList, so extract the href values and make them
            # absolute against the current page.
            for href in city.xpath('a/@href').extract():
                yield Route(url=response.urljoin(href),
                            execute=self.parse_center)

    def _linked_url(self, response, css):
        """Return the absolute ``href`` of the first *css* match, or None."""
        hrefs = response.css(css).xpath('@href').extract()
        return response.urljoin(hrefs[0]) if hrefs else None

    def parse_center(self, response):
        """
        Parse a center.

        Adds the center's name and description to the loader and queues
        the contact and media pages as further routes.
        """
        router = response.router

        # Add to loader
        router.loader.add_css('name', '#block-main .content-banner .module')
        router.loader.add_css('description', '#system')

        # Calc and add new routes
        # NOTE(review): assumes the #contact and #media elements are links
        # carrying an href attribute -- confirm against the page markup.
        follow_ups = (
            ('#contact', self.parse_center_contact),
            ('#media', self.parse_center_media),
        )
        for css, callback in follow_ups:
            url = self._linked_url(response, css)
            if url:
                router.add_route(url=url, execute=callback)

    def parse_center_contact(self, response):
        """
        Parse a center contact page.
        """
        loader = response.router.loader

        # Add more to the same persistent loader
        email_re = r'\b[\w.%+-]+@[\w.-]+\.[A-Za-z]{2,4}\b'
        # '//p' searches every paragraph; a bare 'p' evaluated at the
        # document root only matches direct children of <html>.
        loader.add_xpath('email', '//p', re=email_re)

    def parse_center_media(self, response):
        """
        Parse a center media library page.
        """
        loader = response.router.loader

        # Add more to the loader: the src of every iframe on the page
        sources = response.css('iframe').xpath('@src').extract()
        if sources:
            loader.add_value('videos', sources)

        # If this is the last route in the router, which in this example it
        # is, then after this method executes the router will yield its
        # contents.
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment