from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from scrapy.http import Request

from craig.items import CraigslistItem


class MySpider(CrawlSpider):
    name = "craig"
    allowed_domains = ["craigslist.org"]
    start_urls = ["https://cleveland.craigslist.org/search/mis"]

    # Follow the "next" pagination button and hand each results page to parse_page.
    rules = (
        Rule(LinkExtractor(restrict_xpaths=('//a[@class="button next"]',)),
             callback='parse_page', follow=True),
    )

    def parse_page(self, response):
        # Each listing title on the results page sits inside a span.pl element.
        titles = response.xpath("//span[@class='pl']")
        for title in titles:
            item = CraigslistItem()
            item["title"] = title.xpath("a/span/text()").extract()
            item["link"] = title.xpath("a/@href").extract()
            # Build an absolute URL to the posting and pass the partial item along.
            url = 'https://cleveland.craigslist.org{}'.format(''.join(item['link']))
            yield Request(url=url, meta={'item': item}, callback=self.parse_item_page)

    def parse_item_page(self, response):
        # Strip <br> tags so the posting body extracts as one block of text.
        response = response.replace(body=response.body.replace(b'<br>', b''))
        item = response.meta['item']
        item['description'] = response.xpath(
            "normalize-space(.//section[@id='postingbody']/text())").extract()
        return item
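
For reference, here is a minimal sketch of the CraigslistItem the spider imports from craig.items, assuming only the three fields populated above (title, link, description); the actual items.py in the project may define more.

import scrapy

class CraigslistItem(scrapy.Item):
    # Fields filled in by MySpider: listing title, relative link, and posting body.
    title = scrapy.Field()
    link = scrapy.Field()
    description = scrapy.Field()

With the item defined, the spider can be run from the Scrapy project root with, for example, scrapy crawl craig -o results.json to dump the scraped items to JSON.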