Skip to content

Instantly share code, notes, and snippets.

@harrywang
Last active December 3, 2019 15:13
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save harrywang/1d2700f2b6ce9047175905ad22a950be to your computer and use it in GitHub Desktop.
Save harrywang/1d2700f2b6ce9047175905ad22a950be to your computer and use it in GitHub Desktop.
import scrapy
from scrapy.loader import ItemLoader
from tutorial.items import QuoteItem
class QuotesSpider(scrapy.Spider):
    """Spider that scrapes quotes and enriches each one with author details.

    ``parse`` walks the quote listing pages, loading the quote text and tags
    into a ``QuoteItem``, then follows the author link of every quote.
    ``parse_author`` fills in the author fields on that same item and
    yields it.
    """

    name = "quotes"
    allowed_domains = ["toscrape.com"]
    start_urls = ['http://quotes.toscrape.com/']

    def parse(self, response):
        """Extract the quotes on a listing page and schedule follow-up requests.

        For every ``div.quote`` element a partially filled ``QuoteItem`` is
        built and handed to ``parse_author`` through the request ``meta``.
        The "next" pagination link, when present, is followed with this same
        callback.

        Args:
            response: The downloaded listing page.

        Yields:
            Requests for author pages and for the next listing page; a
            partially filled item when a quote has no author link.
        """
        # Lazy %-style arguments: the message is only formatted if emitted.
        self.logger.info('Parse function called on %s', response.url)

        # XPath equivalent: response.xpath("//div[@class='quote']")
        for quote in response.css('div.quote'):
            loader = ItemLoader(item=QuoteItem(), selector=quote)
            # XPath equivalent (note the leading dot for a relative path):
            #   loader.add_xpath('quote_content', ".//span[@class='text']/text()")
            loader.add_css('quote_content', '.text::text')
            loader.add_css('tags', '.tag::text')
            quote_item = loader.load_item()

            author_url = quote.css('.author + a::attr(href)').get()
            if not author_url:
                # response.follow() rejects a missing URL; emit what has been
                # collected so far instead of aborting the whole page.
                self.logger.warning(
                    'No author link found for a quote on %s', response.url
                )
                yield quote_item
                continue

            # Go to the author page and pass along the quote collected so far.
            # dont_filter=True is required: several quotes share one author
            # page, and the duplicate filter would otherwise drop every
            # request after the first, silently losing those quotes.
            yield response.follow(
                author_url,
                self.parse_author,
                meta={'quote_item': quote_item},
                dont_filter=True,
            )

        # Go to the next page, if there is one.
        for next_link in response.css('li.next a'):
            yield response.follow(next_link, self.parse)

    def parse_author(self, response):
        """Complete a quote item with the details found on the author page.

        Args:
            response: The downloaded author page; ``response.meta`` must
                carry the partially filled item under ``'quote_item'``.

        Yields:
            The item received from ``parse`` with the author fields added.
        """
        quote_item = response.meta['quote_item']
        loader = ItemLoader(item=quote_item, response=response)
        loader.add_css('author_name', '.author-title::text')
        loader.add_css('author_birthday', '.author-born-date::text')
        loader.add_css('author_bornlocation', '.author-born-location::text')
        loader.add_css('author_bio', '.author-description::text')
        yield loader.load_item()
@deleyva
Copy link

deleyva commented Nov 29, 2019

The first i of the first import is missing.

Thanks for your great tutorial!

@harrywang
Copy link
Author

@deleyva thanks for pointing this out! I have fixed it.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment