Last active
December 3, 2019 15:13
-
-
Save harrywang/1d2700f2b6ce9047175905ad22a950be to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import scrapy | |
from scrapy.loader import ItemLoader | |
from tutorial.items import QuoteItem | |
class QuotesSpider(scrapy.Spider):
    """Spider that scrapes quotes and merges in details from each author's page.

    For every ``div.quote`` element on a listing page the spider collects the
    quote text and tags into a ``QuoteItem``, then follows the author link and
    lets ``parse_author`` complete the same item with the author fields.
    Pagination is handled by following the ``li.next a`` link.
    """

    name = "quotes"
    allowed_domains = ["toscrape.com"]
    start_urls = ['http://quotes.toscrape.com/']

    def parse(self, response):
        """Parse one listing page.

        Yields:
            A request to the author page for each quote (carrying the
            partially filled item in ``meta['quote_item']``), the partial item
            itself when a quote has no author link, and finally a request for
            the next listing page if one exists.
        """
        self.logger.info('Parse function called on %s', response.url)

        for quote in response.css('div.quote'):
            # Selectors are relative to this quote element, not the whole page.
            loader = ItemLoader(item=QuoteItem(), selector=quote)
            loader.add_css('quote_content', '.text::text')
            loader.add_css('tags', '.tag::text')
            quote_item = loader.load_item()

            author_url = quote.css('.author + a::attr(href)').get()
            if author_url is None:
                # Without a link there is no author page to visit; emit what
                # we have instead of letting response.follow() fail and abort
                # the rest of this page (including pagination).
                self.logger.warning(
                    'No author link found for a quote on %s', response.url
                )
                yield quote_item
                continue

            # Go to the author page and pass along the quote collected so far.
            # dont_filter=True is required: several quotes share one author
            # URL, and the default duplicate filter would drop every request
            # after the first, silently losing those quotes.
            yield response.follow(
                author_url,
                self.parse_author,
                meta={'quote_item': quote_item},
                dont_filter=True,
            )

        # Go to the next listing page, if any.
        for a in response.css('li.next a'):
            yield response.follow(a, self.parse)

    def parse_author(self, response):
        """Complete a quote item with fields scraped from the author page.

        Reads the partially filled item from ``response.meta['quote_item']``,
        adds ``author_name``, ``author_birthday``, ``author_bornlocation`` and
        ``author_bio`` from the page, and yields the loaded item.
        """
        quote_item = response.meta['quote_item']
        loader = ItemLoader(item=quote_item, response=response)
        loader.add_css('author_name', '.author-title::text')
        loader.add_css('author_birthday', '.author-born-date::text')
        loader.add_css('author_bornlocation', '.author-born-location::text')
        loader.add_css('author_bio', '.author-description::text')
        yield loader.load_item()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
The first `i` of the first `import` is missing. Thanks for your great tutorial!