Skip to content

Instantly share code, notes, and snippets.

@aconanlai
Last active July 9, 2016 23:53
Show Gist options
  • Save aconanlai/a1407841c47cc3751c221a750b77f772 to your computer and use it in GitHub Desktop.
Save aconanlai/a1407841c47cc3751c221a750b77f772 to your computer and use it in GitHub Desktop.
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector
from craig.items import CraigslistItem
from scrapy.http import Request
class MySpider(CrawlSpider):
    """Spider that walks the paginated ``/search/mis`` listing on
    cleveland.craigslist.org and scrapes every posting it links to.

    Each result row becomes a ``CraigslistItem`` carrying ``title`` and
    ``link`` (both lists, as returned by ``extract()``); the posting page
    is then fetched to fill in ``description``.
    """

    name = "craig"
    allowed_domains = ["craigslist.org"]
    start_urls = ["https://cleveland.craigslist.org/search/mis"]
    rules = (
        # Follow only the "next" pagination button.  No ``allow`` pattern is
        # given, so every link inside the restricted region is accepted.
        Rule(
            SgmlLinkExtractor(restrict_xpaths=('//a[@class="button next"]',)),
            callback='parse_page',
            follow=True,
        ),
    )

    def parse_start_url(self, response):
        """Scrape the first results page as well.

        CrawlSpider invokes rule callbacks only for links it extracts, so
        without this hook the listings on the start URL itself would never
        reach ``parse_page``.
        """
        return self.parse_page(response)

    def parse_page(self, response):
        """Yield one ``Request`` per posting found on a results page.

        The partially filled item travels with the request in
        ``meta['item']`` so that ``parse_item_page`` can complete it.
        """
        for row in response.xpath("//span[@class='pl']"):
            item = CraigslistItem()
            item["title"] = row.xpath("a/span/text()").extract()
            item["link"] = row.xpath("a/@href").extract()

            href = ''.join(item["link"])
            if not href:
                # A row without a link has no posting page to fetch;
                # requesting the bare site root would only produce an
                # item with a meaningless description.
                continue

            # NOTE(review): assumes hrefs are site-relative paths such as
            # "/mis/123.html" -- confirm against the live markup.
            url = 'https://cleveland.craigslist.org{}'.format(href)
            yield Request(
                url=url,
                meta={'item': item},
                callback=self.parse_item_page,
            )

    def parse_item_page(self, response):
        """Add the posting body text to the item and return it.

        Returns the ``CraigslistItem`` received through ``response.meta``
        with ``description`` set to a one-element list containing the
        whitespace-normalized text of ``section#postingbody``.
        """
        # XPath's normalize-space() uses only the first node of a node-set.
        # Dropping the <br> tags first lets the paragraphs collapse into a
        # single text node instead of being cut off at the first break.
        # Bytes literals are used because the response body is bytes (a
        # plain str on Python 2, where this is equivalent).
        response = response.replace(body=response.body.replace(b'<br>', b''))
        item = response.meta['item']
        item['description'] = response.xpath(
            "normalize-space(.//section[@id='postingbody']/text())"
        ).extract()
        return item
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment