Skip to content

Instantly share code, notes, and snippets.

@iAnanich
Last active April 27, 2018 13:02
Show Gist options
  • Save iAnanich/99d9b22f3d5e0288906c353663b1c3ff to your computer and use it in GitHub Desktop.
Save iAnanich/99d9b22f3d5e0288906c353663b1c3ff to your computer and use it in GitHub Desktop.
Scrapy httpbin.org/anything spider for benchmarking purposes
# common.py: place this module in the project package, next to settings.py and the spiders/ folder
from urllib.parse import urlparse, urlencode, urlunparse
def update_query(url: str, query_dict: dict) -> str:
    """Return *url* with its query string merged with *query_dict*.

    Parameters already present in the URL are kept unless *query_dict*
    supplies the same key, in which case the new value wins. All other URL
    components (scheme, host, path, params, fragment) are left unchanged.

    The existing query string is parsed with ``urllib.parse.parse_qsl`` so
    that percent-encoded values are decoded before being re-encoded (no
    double encoding), values containing ``=`` are handled, and parameters
    without a value (``?flag``) are preserved as empty strings instead of
    raising ``ValueError``. If a key is repeated in the URL, only its last
    value is kept.
    """
    # Local import: the module-level import line does not bring in parse_qsl.
    from urllib.parse import parse_qsl

    components = urlparse(url)
    # keep_blank_values=True so "?flag" / "?a=" are not silently dropped.
    merged = dict(parse_qsl(components.query, keep_blank_values=True))
    merged.update(query_dict)
    # ParseResult is a namedtuple; _replace swaps only the query component.
    return urlunparse(components._replace(query=urlencode(merged)))
# -*- coding: utf-8 -*-
import scrapy
from ..common import update_query
class HttpbinSpider(scrapy.Spider):
    """Two-level fan-out spider against httpbin.org/anything.

    ``start_requests`` issues ``BREADTH_FACTOR`` first-level requests; each
    first-level response then spawns ``DEEP_FACTOR`` second-level requests.
    Every response yields an item carrying the ``download_latency`` value
    read from ``response.meta``.
    """

    name = 'httpbin'
    allowed_domains = ['httpbin.org']

    BASE_URL = 'http://httpbin.org/anything'
    DEEP_FACTOR = 16     # second-level requests spawned per first-level response
    BREADTH_FACTOR = 16  # first-level requests issued at start

    def start_requests(self):
        """Yield one first-level request per breadth index."""
        for index in range(self.BREADTH_FACTOR):
            params = {'breadth': index}
            # The same mapping is used both as the URL query and as request meta.
            yield scrapy.Request(
                update_query(self.BASE_URL, params),
                meta=params,
                callback=self.parse_breadth,
            )

    def parse_breadth(self, response):
        """Emit the first-level item, then fan out the second-level requests."""
        breadth = response.meta['breadth']
        latency = response.meta['download_latency']

        yield {
            'breadth': breadth,
            'breadth_latency': latency,
        }

        for level in range(self.DEEP_FACTOR):
            # Carry the first-level latency forward so parse_deep can sum it.
            params = {
                'breadth': breadth,
                'breadth_latency': latency,
                'deep': level,
            }
            yield scrapy.Request(
                update_query(response.url, params),
                meta=params,
                callback=self.parse_deep,
            )

    def parse_deep(self, response):
        """Emit the second-level item with per-level and combined latency."""
        info = response.meta
        first_hop = info['breadth_latency']
        combined = info['download_latency'] + first_hop
        yield {
            'breadth': info['breadth'],
            'deep': info['deep'],
            'total_latency': combined,
            'breadth_latency': first_hop,
        }
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment