# -*- coding: utf-8 -*-
"""
Created on Wed May 24 14:40:50 2017

Requirements:
1. Read the catalogue URL
2. Loop through the pages of the data catalogue that follow the first page
3. For every dataset node, go inside and crawl the resource links
4. Test each resource link for 404 errors
5. Return the results as a pandas DataFrame
"""
import scrapy


class DatasetSpider(scrapy.Spider):
    name = 'spider'
    # Requirement 2: crawl catalogue pages 1 through 9.
    start_urls = ['https://catalog.data.gov/dataset?page={x}'.format(x=x) for x in range(1, 10)]

    def parse(self, response):
        # Each dataset on a catalogue page is headed by an <h3 class="dataset-heading">.
        for title in response.css('h3.dataset-heading'):
            url = title.css('a::attr(href)').extract_first()
            request = scrapy.Request('https://catalog.data.gov' + url, callback=self.parse_page2)
            # Pass the dataset title and relative URL along to the detail-page callback.
            request.meta['title'] = title.css('a::text').extract_first()
            request.meta['url'] = url
            yield request

    def parse_page2(self, response):
        resp = {
            'title': response.meta['title'],
            'url': response.meta['url'],
            'resources': []
        }
        # Requirement 3: collect every resource link on the dataset's detail page.
        # The loop variable is already the <a> element, so select ::text and
        # ::attr(href) directly (the original 'a::text' would match nothing,
        # since it searches for <a> descendants of an <a>).
        for link in response.css('ul.resource-list>li>a'):
            resp['resources'].append({
                'resource': link.css('::text').extract_first(),
                'resource_link': link.css('::attr(href)').extract_first()
            })
        yield resp
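

# The spider above covers requirements 1-3 but stops short of the 404 check and
# the pandas DataFrame (requirements 4 and 5). Below is a minimal post-processing
# sketch, assuming the crawled items are first exported to JSON, e.g. with
#   scrapy runspider spider.py -o items.json
# The check_links helper, the items.json path, and the requests-based HEAD probe
# are illustrative additions, not part of the original gist.
import json

import pandas as pd
import requests


def check_links(items_path='items.json'):
    """Probe every crawled resource link and return the results as a DataFrame."""
    with open(items_path) as f:
        items = json.load(f)

    rows = []
    for item in items:
        for resource in item['resources']:
            link = resource['resource_link']
            try:
                # A HEAD request keeps the probe cheap; follow redirects so only
                # genuinely broken links register as 404s.
                status = requests.head(link, allow_redirects=True, timeout=10).status_code
            except requests.RequestException:
                status = None  # timeouts, DNS failures, malformed URLs, etc.
            rows.append({
                'title': item['title'],
                'resource': resource['resource'],
                'resource_link': link,
                'status': status,
                'is_404': status == 404,
            })
    return pd.DataFrame(rows)


if __name__ == '__main__':
    df = check_links()
    print(df[df['is_404']])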