Created
June 11, 2017 01:05
-
-
Save raiderrobert/a56c834a43e389f82196a2f2c826d44c to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*-
"""
Created on Wed May 24 14:40:50 2017
Requirements:
1. Read URL
2. Loop function to crawl through the following pages after the first page in the data catalogue
3. For every data node, go inside and crawl the resource links
4. Test each link for 404 errors
5. Return result into a pandas dataframe
"""
import scrapy


class DatasetSpider(scrapy.Spider):
    """Crawl data.gov catalogue listing pages and, for every dataset found,
    follow its detail page and yield the dataset's resource name/link pairs.
    """

    name = 'spider'
    # Catalogue pages 1-9 (range upper bound is exclusive).
    start_urls = [
        'https://catalog.data.gov/dataset?page={x}'.format(x=x)
        for x in range(1, 10)
    ]

    def parse(self, response):
        """Parse one catalogue listing page.

        For each dataset heading, schedule a request to the dataset's detail
        page, carrying the title and relative URL along in ``request.meta``
        so ``parse_page2`` can include them in its output item.
        """
        for title in response.css('h3.dataset-heading'):
            url = title.css('a::attr(href)').extract_first()
            # extract_first() returns None when the heading has no link;
            # guard it, otherwise 'https://...' + None raises TypeError.
            if url is None:
                continue
            request = scrapy.Request(
                'https://catalog.data.gov' + url,
                callback=self.parse_page2,
            )
            request.meta['title'] = title.css('a::text').extract_first()
            request.meta['url'] = url
            yield request

    def parse_page2(self, response):
        """Parse a dataset detail page and yield one item per dataset.

        The item carries the title/url forwarded via ``response.meta`` plus a
        list of ``{'resource': <name>, 'resource_link': <href>}`` dicts, one
        per entry in the page's resource list.
        """
        resp = {
            'title': response.meta['title'],
            'url': response.meta['url'],
            'resources': [],
        }
        # NOTE: 'a::text' on an <a> selector still matches the element itself
        # because parsel prefixes CSS with descendant-or-self:: — kept as-is.
        for title in response.css('ul.resource-list>li>a'):
            resp['resources'].append({
                'resource': title.css('a::text').extract_first(),
                'resource_link': title.css('a::attr(href)').extract_first(),
            })
        yield resp
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment