Skip to content

Instantly share code, notes, and snippets.

/items.py Secret

Created December 28, 2012 23:50
Show Gist options
  • Save anonymous/da1b487b2d7790b6f954 to your computer and use it in GitHub Desktop.
Save anonymous/da1b487b2d7790b6f954 to your computer and use it in GitHub Desktop.
# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/topics/items.html
from scrapy.item import Item, Field
class ParishItem(Item):
    """Scraped record describing a parish and the bodies it is linked to.

    Each linked body is stored as a pair of fields: the text of the link
    and the identifier taken from the link's URL.
    """

    # The parish itself.
    legal_name = Field()
    parish_id = Field()

    # Archdeaconry link.
    archdeaconry = Field()
    archdeaconry_id = Field()

    # Deanery link.
    deanery = Field()
    deanery_id = Field()

    # Benefice link.
    benefice = Field()
    benefice_id = Field()
class DioceseItem(Item):
    """Scraped record holding a diocese's name and its identifier."""

    diocese_id = Field()
    diocese_name = Field()
from scrapy import log
class CustomDownloaderMiddleware(object):
    """Downloader middleware that skips venue ids which bounce to the home page.

    When an outgoing request targets the site's home page URL, the request
    is replaced by one for the next venue id, and the spider's
    ``current_id`` counter is advanced to match.
    """

    # URL a request carries when it is headed for the home page.
    HOME_URL = "http://www.achurchnearyou.com//"
    # Template for a venue page; filled with the (zero-padded) venue id.
    VENUE_URL = "http://www.achurchnearyou.com/venue.php?V=%04d"

    def process_request(self, request, spider):
        """Swap a home-page request for a request to the next venue id.

        :param request: the outgoing request about to be downloaded.
        :param spider: the spider that issued the request; its
            ``current_id`` attribute is read and, on a home-page request,
            incremented.
        :returns: ``None`` to let the request proceed unchanged, or a copy
            of ``request`` pointing at the next venue page.
        """
        log.msg("### Process Request Middleware " + request.url, level=log.WARNING)

        # Downloader middlewares run for every spider in the project, so a
        # spider without a ``current_id`` counter must pass through rather
        # than fail with AttributeError on each request.
        current_id = getattr(spider, 'current_id', None)
        if current_id is None:
            return None

        log.msg("### Process Request Middleware %d" % current_id)

        if request.url != self.HOME_URL:
            return None

        log.msg("### Process Request Middleware - Got redirected to home - creating new request")
        next_id = current_id + 1
        # Keep the spider's counter in step with the id being requested.
        spider.current_id = next_id
        return request.replace(url=self.VENUE_URL % next_id)
# def process_response(self, request, response, spider):
# log.msg("In Middleware " + response.url + " " + request.url, level=log.WARNING)
# if response.url == "http://www.achurchnearyou.com//":
# raise IgnoreRequest("Ignored as redirected to home page! #ROBIN")
# else:
# return response
# return response
from scrapy.spider import BaseSpider
from scrapy.http import Request
from scrapy.selector import HtmlXPathSelector
from acny.items import ParishItem
class ChurchSpider(BaseSpider):
    """Walk venue pages by increasing id and emit one ParishItem per page.

    ``current_id`` holds the id of the most recently scheduled venue page.
    Every parsed response schedules the page for the following id, so the
    spider itself defines no stop condition.
    """

    name = "church"
    allowed_domains = ["achurchnearyou.com"]
    start_urls = ["http://www.achurchnearyou.com/venue.php?V=00001"]
    current_id = 63

    # (item field for the link text, item field for the id, href prefix)
    _LINKED_FIELDS = (
        ("archdeaconry", "archdeaconry_id", "/archdeaconry/"),
        ("deanery", "deanery_id", "/deanery/"),
        ("legal_name", "parish_id", "/parish/"),
        ("benefice", "benefice_id", "/benefice/"),
    )

    def parse(self, response):
        """Extract the linked bodies from a venue page and queue the next id.

        For each href prefix in ``_LINKED_FIELDS`` the first matching
        ``<a>`` supplies the link text and, from the third ``/``-separated
        part of its href, the id. A link that is absent from the page is
        skipped instead of raising IndexError, so one incomplete page no
        longer discards the item and breaks the request chain.

        :param response: the downloaded venue page.
        :returns: a list holding the item (omitted when no link at all was
            found) followed by the request for the next venue id.
        """
        hxs = HtmlXPathSelector(response)
        item = ParishItem()
        found_any = False

        for text_field, id_field, prefix in self._LINKED_FIELDS:
            anchor = "//a[starts-with(@href,'%s')]" % prefix
            texts = hxs.select(anchor + "/text()").extract()
            hrefs = hxs.select(anchor + "/@href").extract()
            if texts:
                item[text_field] = texts[0]
                found_any = True
            if hrefs:
                # href starts with "/<kind>/", so index 2 always exists.
                item[id_field] = hrefs[0].split("/")[2]
                found_any = True

        # Advance the shared counter, then request the page for the new id.
        self.current_id += 1
        next_url = "http://www.achurchnearyou.com/venue.php?V=%04d" % self.current_id
        follow_up = Request(next_url, callback=self.parse)

        if found_any:
            return [item, follow_up]
        return [follow_up]

    def parse_details(self, response):
        """Build an item from a page's heading, date paragraph and body text.

        Not used as a callback anywhere in this class.

        NOTE(review): the ParishItem declared in this project's items
        module shows no ``name``, ``date``, ``desc`` or ``url`` fields, and
        Scrapy items reject undeclared fields with KeyError, so this method
        looks like it would fail if called. Confirm whether it is leftover
        code or needs its own item class.

        :param response: the downloaded page.
        :returns: a ParishItem populated from the page.
        :raises IndexError: if the page has no ``<h1>`` text or no
            ``<p class="date">`` text.
        """
        hxs = HtmlXPathSelector(response)
        title = hxs.select("//h1/text()").extract()[0]
        date = hxs.select('//p[@class="date"]/text()').extract()[0].strip()
        paragraphs = hxs.select(
            "//div[@id='main_content' and @class='col six float single']/p/text()"
        ).extract()
        desc = "\n".join(paragraphs)
        return ParishItem(name=title, date=date, desc=desc, url=response.url)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment