Skip to content

Instantly share code, notes, and snippets.

@eupendra
Created July 1, 2020 04:50
Show Gist options
  • Save eupendra/6b42b0703c868a9112d9c3289f1b2f87 to your computer and use it in GitHub Desktop.
# -*- coding: utf-8 -*-
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
class h1_tagsSpider(CrawlSpider):
    """Crawl books.toscrape.com and emit every <h1> heading together with the
    URL of the page it was found on."""

    name = 'h1_tags'
    allowed_domains = ['books.toscrape.com']
    start_urls = ['http://books.toscrape.com/']

    # Follow every extracted link on the site; each fetched page is handed
    # to parse_item.
    rules = (
        Rule(LinkExtractor(), callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        """Yield one dict per <h1> text node on the response page."""
        for h1_text in response.xpath('//h1/text()').getall():
            yield {'text': h1_text, 'page': response.url}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment