-
-
Save anonymous/da1b487b2d7790b6f954 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Define here the models for your scraped items | |
# | |
# See documentation in: | |
# http://doc.scrapy.org/topics/items.html | |
from scrapy.item import Item, Field | |
class ParishItem(Item):
    """A Church of England parish scraped from a venue page on
    achurchnearyou.com, together with the names and numeric ids of the
    church-hierarchy levels that contain it.

    Each *_id field holds the numeric path segment taken from the
    corresponding link's href (e.g. '/deanery/123/...' -> '123').
    """

    legal_name = Field()        # parish legal name (link text of /parish/ anchor)
    archdeaconry = Field()      # archdeaconry name
    archdeaconry_id = Field()   # archdeaconry numeric id
    deanery = Field()           # deanery name
    deanery_id = Field()        # deanery numeric id
    parish_id = Field()         # parish numeric id
    benefice = Field()          # benefice name
    benefice_id = Field()       # benefice numeric id
class DioceseItem(Item):
    """A Church of England diocese: its display name and numeric id."""

    diocese_name = Field()   # diocese name
    diocese_id = Field()     # diocese numeric id
--- next gist file: Scrapy downloader middleware ---
from scrapy import log | |
class CustomDownloaderMiddleware(object):
    """Downloader middleware that skips over missing venue ids.

    achurchnearyou.com answers requests for unknown venue ids by
    redirecting to the site home page ("http://www.achurchnearyou.com//").
    When that happens, this middleware bumps the spider's ``current_id``
    counter and reissues the request for the next sequential venue id,
    so the crawl keeps advancing instead of stalling on the home page.
    """

    def process_request(self, request, spider):
        """Retry with the next venue id when redirected to the home page.

        Returns a replacement Request for the next venue id on a
        home-page redirect; returns None otherwise so Scrapy continues
        normal processing of ``request``.
        """
        log.msg("### Process Request Middleware " + request.url, level=log.WARNING)
        # The spider is expected to expose a numeric `current_id`
        # (see ChurchSpider); raises AttributeError if it does not.
        current_id = getattr(spider, 'current_id')
        log.msg("### Process Request Middleware %d" % (current_id), level=log.WARNING)
        if request.url == "http://www.achurchnearyou.com//":
            log.msg("### Process Request Middleware - Got redirected to home - creating new request")
            # Advance the shared counter so the spider and middleware agree
            # on which id comes next, then reissue for that id.
            setattr(spider, 'current_id', current_id + 1)
            return request.replace(url="http://www.achurchnearyou.com/venue.php?V=%04d" % (current_id + 1))
--- next gist file: Scrapy spider ---
from scrapy.spider import BaseSpider | |
from scrapy.http import Request | |
from scrapy.selector import HtmlXPathSelector | |
from acny.items import ParishItem | |
class ChurchSpider(BaseSpider):
    """Crawl achurchnearyou.com venue pages by sequential numeric id,
    yielding one ParishItem per venue page plus a Request for the next id.
    """

    name = "church"
    allowed_domains = ["achurchnearyou.com"]
    start_urls = ["http://www.achurchnearyou.com/venue.php?V=00001"]
    # Next venue id to fetch; also read/bumped by CustomDownloaderMiddleware
    # when a request is redirected to the home page.
    # NOTE(review): starts at 63 while start_urls begins at id 1 — looks like
    # a resume point from a previous run; confirm before relying on it.
    current_id = 63

    def _extract_pair(self, hxs, prefix):
        """Return (link text, numeric id) of the first anchor whose href
        starts with '/<prefix>/' — the id is the second path segment,
        e.g. '/deanery/123/...' -> '123'.
        """
        anchor = "//a[starts-with(@href,'/%s/')]" % prefix
        text = hxs.select(anchor + "/text()").extract()[0]
        ident = hxs.select(anchor + "/@href").extract()[0].split("/")[2]
        return text, ident

    def parse(self, response):
        """Build a ParishItem from the venue page and chain a request for
        the next sequential venue id.
        """
        current_id = getattr(self, 'current_id')
        hxs = HtmlXPathSelector(response)
        item = ParishItem()
        item['archdeaconry'], item['archdeaconry_id'] = self._extract_pair(hxs, 'archdeaconry')
        item['deanery'], item['deanery_id'] = self._extract_pair(hxs, 'deanery')
        item['legal_name'], item['parish_id'] = self._extract_pair(hxs, 'parish')
        item['benefice'], item['benefice_id'] = self._extract_pair(hxs, 'benefice')
        # Advance the shared counter (the downloader middleware reads it too).
        setattr(self, 'current_id', current_id + 1)
        next_url = "http://www.achurchnearyou.com/venue.php?V=%04d" % (current_id + 1)
        return [item, Request(next_url, callback=self.parse)]

    def parse_details(self, response):
        """Scrape title, date and description paragraphs from a detail page.

        NOTE(review): ParishItem declares none of the fields used below
        (name/date/desc/url), so constructing it here raises KeyError.
        Nothing in this file uses parse_details as a callback — it appears
        to be leftover tutorial code; either delete it or add the fields.
        """
        hxs = HtmlXPathSelector(response)
        title = hxs.select("//h1/text()").extract()[0]
        date = hxs.select('//p[@class="date"]/text()').extract()[0].strip()
        desc_ps = hxs.select("//div[@id='main_content' and @class='col six float single']/p/text()").extract()
        desc = "\n".join(desc_ps)
        return ParishItem(name=title, date=date, desc=desc, url=response.url)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment