@dyerrington
Created September 25, 2016 19:50
Example spider for class that scrapes images from Craigslist detail pages. The only thing you need to do to get the image model to work correctly is update the items.py file to include it.
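A minimal sketch of what that items.py entry could look like, assuming a standard Scrapy Item subclass; the field names image_url and post_url mirror what the spider assigns in parse_image_details, and CraigslistItem / CraigslistItemDetail are assumed to already be defined in the same module:

# items.py (sketch) -- field names must match what parse_image_details assigns
import scrapy

class CraigslistImage(scrapy.Item):
    image_url = scrapy.Field()  # src of each <img> found on the detail page
    post_url = scrapy.Field()   # URL of the Craigslist post the image came from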
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
import scrapy

# item models
from craigslist.items import CraigslistItem, CraigslistItemDetail, CraigslistImage


class CraigslistSpider(CrawlSpider):

    name = "craigslist"
    allowed_domains = ["craigslist.org"]

    start_urls = [
        "https://losangeles.craigslist.org/search/rva?query=airstream"
    ]

    rules = (
        # Rule(LinkExtractor(allow=(), restrict_xpaths=('//a[@class="button next"]',)), follow=True),
        Rule(LinkExtractor(allow=(), restrict_xpaths=('//a[@class="hdrlnk"]',)), callback="parse_image_details", follow=True),
    )

    def parse_image_details(self, response):
        images = response.xpath("//img/@src")
        for image in images.extract():
            item = CraigslistImage()
            item['image_url'] = image
            item['post_url'] = response.url
            yield item
    # def parse_search(self, response):
    #     """
    #     We will be retiring our parse_search method
    #     """
    #     for sel in response.xpath("//p[@class='row']"):
    #         # //span[@class='price']
    #         item = CraigslistItem()
    #         item['title'] = sel.xpath("span/span/a[@class='hdrlnk']").extract()[0]
    #         item['link'] = sel.xpath("span/span/a/@href").extract()[0]
    #         item['price'] = sel.xpath("span/span/span[@class='price']/text()").extract()[0]
    #         yield item

    # def parse_page(self, response):
    #     print("Parsing a detail page!")
    #     item_detail = CraigslistItemDetail()
    #     extracted = {
    #         "address": response.xpath("//div[@class='mapaddress']/text()").extract(),
    #         "movein_date": response.xpath("//span[contains(@class, 'housing_movein_now')]/text()").extract(),
    #         "price": response.xpath("whatever the hell the query is").extract(),
    #     }
    #     # Check that our extracted entities are not empty
    #     for entity, values in extracted.items():
    #         if len(values) > 0:
    #             # if not empty, take the first value (expecting single-entry lists)
    #             item_detail[entity] = values[0]
    #     yield item_detail
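To try the spider out, assuming a standard Scrapy project layout with this file inside the craigslist project, something like the following would crawl the search results and write the yielded image items to a JSON feed:

scrapy crawl craigslist -o images.json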