Skip to content

Instantly share code, notes, and snippets.

@rshyam1
Created May 24, 2016 01:40
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save rshyam1/b84459a0057d232d9266d146c934d233 to your computer and use it in GitHub Desktop.
Save rshyam1/b84459a0057d232d9266d146c934d233 to your computer and use it in GitHub Desktop.
def parse_main_page(self, response):
    """Yield one detail-page request per listing row on a results page.

    Reads the ``data-pid`` attribute of every element whose ``class``
    attribute is exactly ``row`` and builds the posting URL from it.

    Args:
        response: The downloaded listing page; must provide ``xpath()``.

    Yields:
        Request: One request per posting id, with ``parse_detail_page``
        registered as its callback.
    """
    posting_ids = response.xpath('//*[@class="row"]/@data-pid').extract()
    # Named ``pid`` rather than ``id`` so the builtin is not shadowed.
    for pid in posting_ids:
        link = 'https://newyork.craigslist.org/stn/cto/' + str(pid) + '.html'
        yield Request(link, callback=self.parse_detail_page)
def parse_detail_page(self, response):
    """Build a ``DemoItem`` from a single posting page.

    A field whose XPath matches nothing is stored as an empty string
    instead of aborting the callback with ``IndexError``.

    Args:
        response: The downloaded posting page; must provide ``xpath()``.

    Yields:
        DemoItem: With ``price``, ``title``, ``post_time`` and ``body`` set.
    """

    def first_text(query):
        # Not every posting carries every field, so never index [0] blindly.
        matches = response.xpath(query).extract()
        return matches[0] if matches else ''

    price = first_text('//*[@class = "price"]/text()')
    title = first_text('//*[@ id = "titletextonly"]/text()')
    post_time = first_text(
        '//*[@id = "pagecontainer"]/section/section/div[2]/p[2]/time/text()'
    )

    # Join the non-blank text fragments with single spaces. Unlike the
    # previous reduce()-based fold this also handles zero or one fragment
    # and needs no functools import on Python 3.
    fragments = response.xpath('//*[@id = "postingbody"]//text()').extract()
    trimmed = (fragment.strip() for fragment in fragments)
    body = ' '.join(piece for piece in trimmed if piece)

    # Values are stored as extracted, without str() coercion: under
    # Python 2, str() raises UnicodeEncodeError on non-ASCII text.
    item = DemoItem()
    item['price'] = price
    item['title'] = title
    item['post_time'] = post_time
    item['body'] = body
    # TODO: populate item['update_time'] once a source for it is extracted.
    yield item
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment