Skip to content

Instantly share code, notes, and snippets.

@arunshaji95
Last active April 20, 2018 07:03
Show Gist options
  • Save arunshaji95/1ff481d577fd8c12a98eaad7659a3a7d to your computer and use it in GitHub Desktop.
Save arunshaji95/1ff481d577fd8c12a98eaad7659a3a7d to your computer and use it in GitHub Desktop.
Basic scrapy tutorial
import scrapy
class BookSpider(scrapy.Spider):
    """Spider that walks listing pages and emits one dict per detail page.

    Crawling begins at the single URL in ``start_urls``. ``parse`` handles
    listing pages (product links plus the "next" pagination link) and
    ``parse_detail`` handles the pages those product links point to.
    """

    name = 'bookspider'
    start_urls = ['http://books.toscrape.com/']

    def parse(self, response):
        """Schedule requests for each product link and for the next listing page.

        Args:
            response: The downloaded listing page.

        Yields:
            One request per ``article.product_pod`` link, routed to
            ``parse_detail``, followed by at most one request for the
            ``li.next`` link, routed back to this method.
        """
        detail_hrefs = response.xpath(
            '//article[@class="product_pod"]/div/a/@href'
        ).extract()
        for href in detail_hrefs:
            yield response.follow(href, callback=self.parse_detail)

        next_href = response.xpath(
            '//li[@class="next"]/a/@href'
        ).extract_first()
        if not next_href:
            # Missing or empty "next" link: nothing further to schedule.
            return
        yield response.follow(next_href, callback=self.parse)

    def parse_detail(self, response):
        """Extract title, price, availability and UPC from a detail page.

        Args:
            response: The downloaded detail page.

        Yields:
            A single dict with the keys ``title``, ``price``,
            ``availability`` and ``upc``. ``title``, ``price`` and ``upc``
            are the first matching text node (``None`` when nothing
            matches); ``availability`` is all matching text nodes joined
            together with surrounding whitespace stripped.
        """
        # Shared XPath prefix for the fields located inside the main product div.
        main = '//div[contains(@class, "product_main")]'

        title = response.xpath(main + '/h1/text()').extract_first()
        price = response.xpath(
            main + '/p[@class="price_color"]/text()'
        ).extract_first()
        # The availability paragraph can yield several text nodes, so all of
        # them are collected and merged rather than taking only the first.
        availability_parts = response.xpath(
            main + '/p[contains(@class, "availability")]/text()'
        ).extract()
        upc = response.xpath(
            '//th[contains(text(), "UPC")]/following-sibling::td/text()'
        ).extract_first()

        item = {
            'title': title,
            'price': price,
            'availability': ''.join(availability_parts).strip(),
            'upc': upc,
        }
        yield item
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment