@GeorgeA92 · Last active November 9, 2020 16:30
Per-request delay implementation for a Scrapy app.
import scrapy
from scrapy.crawler import CrawlerProcess
import random
from scrapy.core.downloader import Slot
from time import time
# This is an updated version of my per-request delay implementation mentioned in:
# https://github.com/scrapy/scrapy/issues/802#issuecomment-500245345
# scrapy v. 2.4.0
RESULT_PAGE_PRIORITY = 0
PRODUCT_PAGE_PRIORITY = 100
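
# PerRequestDelaySlot overrides Slot.download_delay(), which the downloader
# (Scrapy 2.4) calls before dequeuing the next request for a slot. If the
# request at the head of the slot queue (self.queue[0][0]) carries a
# "per_request_delay" meta key, that value is used as the delay; otherwise
# the stock DOWNLOAD_DELAY / RANDOMIZE_DOWNLOAD_DELAY logic applies.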
class PerRequestDelaySlot(Slot):
    def download_delay(self):
        if self.queue:
            if "per_request_delay" in self.queue[0][0].meta:
                # print("PER_REQUEST_DELAY: " + str(self.queue[0][0].meta["per_request_delay"]))
                return self.queue[0][0].meta["per_request_delay"]
        # from the original Slot.download_delay():
        if self.randomize_delay:
            return random.uniform(0.5 * self.delay, 1.5 * self.delay)
        return self.delay

class BooksToScrapeSpider(scrapy.Spider):
    name = "books_to_scrape"
    custom_settings = {
        "FEEDS": {
            'items.csv': {'format': 'csv'},
        },
        "DOWNLOAD_DELAY": 0.05,
    }
    def start_requests(self):
        self.crawler.engine.downloader.slots["books.toscrape.com"] = PerRequestDelaySlot(
            concurrency=8, delay=0, randomize_delay=False)
        yield scrapy.Request(url='http://books.toscrape.com/catalogue/page-1.html',
                             callback=self.parse, priority=RESULT_PAGE_PRIORITY,
                             meta={"per_request_delay": 5})
        # extra_test_delay = 30  # seconds
        # self.crawler.engine.downloader.slots["books.toscrape.com"].lastseen = time() + extra_test_delay
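    # Listing pages are requested with a 5 second per-request delay and low
    # priority; product pages use a 1 second delay and a higher priority, so
    # the scheduler hands queued product pages to the downloader before the
    # next listing page.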
    def parse(self, response):
        for product_link in response.css("ol.row h3 a ::attr(href)").extract():
            yield scrapy.Request(response.urljoin(product_link),
                                 callback=self.parse_product_page,
                                 priority=PRODUCT_PAGE_PRIORITY,
                                 meta={"per_request_delay": 1})
        next_page = response.css("li.next a ::attr(href)").extract_first()
        # if response.url == "http://books.toscrape.com/catalogue/page-1.html":
        #     extra_test_delay = 30  # seconds
        #     self.crawler.engine.downloader.slots["books.toscrape.com"].latercall.delay(extra_test_delay)
        if next_page:
            yield scrapy.Request(url=response.urljoin(next_page),
                                 callback=self.parse,
                                 priority=RESULT_PAGE_PRIORITY,
                                 meta={"per_request_delay": 5})
    def parse_product_page(self, response):
        product_container = response.css("article.product_page")
        product = {}
        product["product_name"] = product_container.css("div.product_main h1 ::text").extract_first()
        product["product_description"] = product_container.xpath(
            "//div[@id='product_description']/following::p/text()").extract_first()
        for row in product_container.css("table tr"):
            product[row.css("th::text").extract_first()] = row.css("td::text").extract_first()
        yield product
    def err_back(self, failure):
        # Not finished yet
        pass

if __name__ == "__main__":
    process = CrawlerProcess()
    process.crawl(BooksToScrapeSpider)
    process.start()