-
-
Save anonymous/da1b487b2d7790b6f954 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Define here the models for your scraped items | |
# | |
# See documentation in: | |
# http://doc.scrapy.org/topics/items.html | |
from scrapy.item import Item, Field | |
class ParishItem(Item):
    """A Church of England parish scraped from a venue page on
    achurchnearyou.com, together with the names and numeric ids of the
    church-hierarchy levels that contain it.

    Each *_id field holds the numeric path segment taken from the
    corresponding link's href (e.g. '/deanery/123/...' -> '123').
    """

    legal_name = Field()        # parish legal name (link text of /parish/ anchor)
    archdeaconry = Field()      # archdeaconry name
    archdeaconry_id = Field()   # archdeaconry numeric id
    deanery = Field()           # deanery name
    deanery_id = Field()        # deanery numeric id
    parish_id = Field()         # parish numeric id
    benefice = Field()          # benefice name
    benefice_id = Field()       # benefice numeric id
class DioceseItem(Item):
    """A Church of England diocese: its display name and numeric id."""

    diocese_name = Field()   # diocese name
    diocese_id = Field()     # diocese numeric id
--- next gist file: Scrapy downloader middleware ---
from scrapy import log | |
class CustomDownloaderMiddleware(object):
    """Downloader middleware that skips over missing venue ids.

    achurchnearyou.com answers requests for unknown venue ids by
    redirecting to the site home page ("http://www.achurchnearyou.com//").
    When that happens, this middleware bumps the spider's ``current_id``
    counter and reissues the request for the next sequential venue id,
    so the crawl keeps advancing instead of stalling on the home page.
    """

    def process_request(self, request, spider):
        """Retry with the next venue id when redirected to the home page.

        Returns a replacement Request for the next venue id on a
        home-page redirect; returns None otherwise so Scrapy continues
        normal processing of ``request``.
        """
        log.msg("### Process Request Middleware " + request.url, level=log.WARNING)
        # The spider is expected to expose a numeric `current_id`
        # (see ChurchSpider); raises AttributeError if it does not.
        current_id = getattr(spider, 'current_id')
        log.msg("### Process Request Middleware %d" % (current_id), level=log.WARNING)
        if request.url == "http://www.achurchnearyou.com//":
            log.msg("### Process Request Middleware - Got redirected to home - creating new request")
            # Advance the shared counter so the spider and middleware agree
            # on which id comes next, then reissue for that id.
            setattr(spider, 'current_id', current_id + 1)
            return request.replace(url="http://www.achurchnearyou.com/venue.php?V=%04d" % (current_id + 1))
--- next gist file: Scrapy spider ---
from scrapy.spider import BaseSpider | |
from scrapy.http import Request | |
from scrapy.selector import HtmlXPathSelector | |
from acny.items import ParishItem | |
class ChurchSpider(BaseSpider):
    """Crawl achurchnearyou.com venue pages by sequential numeric id,
    yielding one ParishItem per venue page plus a Request for the next id.
    """

    name = "church"
    allowed_domains = ["achurchnearyou.com"]
    start_urls = ["http://www.achurchnearyou.com/venue.php?V=00001"]
    # Next venue id to fetch; also read/bumped by CustomDownloaderMiddleware
    # when a request is redirected to the home page.
    # NOTE(review): starts at 63 while start_urls begins at id 1 — looks like
    # a resume point from a previous run; confirm before relying on it.
    current_id = 63

    def _extract_pair(self, hxs, prefix):
        """Return (link text, numeric id) of the first anchor whose href
        starts with '/<prefix>/' — the id is the second path segment,
        e.g. '/deanery/123/...' -> '123'.
        """
        anchor = "//a[starts-with(@href,'/%s/')]" % prefix
        text = hxs.select(anchor + "/text()").extract()[0]
        ident = hxs.select(anchor + "/@href").extract()[0].split("/")[2]
        return text, ident

    def parse(self, response):
        """Build a ParishItem from the venue page and chain a request for
        the next sequential venue id.
        """
        current_id = getattr(self, 'current_id')
        hxs = HtmlXPathSelector(response)
        item = ParishItem()
        item['archdeaconry'], item['archdeaconry_id'] = self._extract_pair(hxs, 'archdeaconry')
        item['deanery'], item['deanery_id'] = self._extract_pair(hxs, 'deanery')
        item['legal_name'], item['parish_id'] = self._extract_pair(hxs, 'parish')
        item['benefice'], item['benefice_id'] = self._extract_pair(hxs, 'benefice')
        # Advance the shared counter (the downloader middleware reads it too).
        setattr(self, 'current_id', current_id + 1)
        next_url = "http://www.achurchnearyou.com/venue.php?V=%04d" % (current_id + 1)
        return [item, Request(next_url, callback=self.parse)]

    def parse_details(self, response):
        """Scrape title, date and description paragraphs from a detail page.

        NOTE(review): ParishItem declares none of the fields used below
        (name/date/desc/url), so constructing it here raises KeyError.
        Nothing in this file uses parse_details as a callback — it appears
        to be leftover tutorial code; either delete it or add the fields.
        """
        hxs = HtmlXPathSelector(response)
        title = hxs.select("//h1/text()").extract()[0]
        date = hxs.select('//p[@class="date"]/text()').extract()[0].strip()
        desc_ps = hxs.select("//div[@id='main_content' and @class='col six float single']/p/text()").extract()
        desc = "\n".join(desc_ps)
        return ParishItem(name=title, date=date, desc=desc, url=response.url)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment