Skip to content

Instantly share code, notes, and snippets.

@Pradip-p
Created February 22, 2022 07:33
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
Save Pradip-p/c0fe28d933fa0c0860116019b42120c6 to your computer and use it in GitHub Desktop.
One page Scrapy Python
import scrapy
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings
class LazyCrawler(scrapy.Spider):
    """Scrape quotes (author, text, tag) from quotes.toscrape.com.

    Optionally restrict the crawl to a single tag by passing a spider
    argument on the command line: ``scrapy crawl quotes -a tag=<name>``,
    which starts at http://quotes.toscrape.com/tag/<name>.
    """

    name = "quotes"

    def start_requests(self):
        """Yield the initial request, optionally scoped to one tag."""
        url = 'http://quotes.toscrape.com/'
        # `tag` is an optional spider argument (-a tag=...); absent by default.
        tag = getattr(self, 'tag', None)
        if tag is not None:
            url = url + 'tag/' + tag
        yield scrapy.Request(url, self.parse)

    def parse(self, response):
        """Extract every quote on the page, then follow pagination.

        Yields one dict per quote with 'author', 'text' and 'tag' keys,
        plus a follow-up request for the next page while one exists.
        """
        for quote in response.css('div.quote'):
            yield {
                'author': quote.css('small.author::text').get(),
                'text': quote.css('span.text::text').get(),
                # NOTE(review): .get() returns only the FIRST tag even when a
                # quote carries several; switch to .getall() to capture all.
                'tag': quote.css('.tags a.tag::text').get(),
            }
        # Keep following the "Next" link until pagination runs out.
        next_page = response.css('li.next a::attr(href)').get()
        if next_page is not None:
            yield response.follow(next_page, self.parse)
if __name__ == "__main__":
    # Run the spider in-process without the `scrapy crawl` CLI.
    # get_project_settings() picks up project settings when run inside a
    # Scrapy project directory; otherwise Scrapy defaults apply.
    process = CrawlerProcess(get_project_settings())
    process.crawl(LazyCrawler)
    process.start()  # the script will block here until the crawling is finished
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment