@Parth-Vader
Created July 10, 2017 13:52
Program to run the spider benchmark with PyPy
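One way to run it, assuming Scrapy (plus the six and click packages it imports) is installed into the PyPy environment, for example with "pypy -m pip install scrapy six click", is to save the script under an arbitrary name such as followall_bench.py and invoke "pypy followall_bench.py". The spider expects a local mirror of books.toscrape.com served at http://localhost/books.toscrape.com/, and it appends one items/sec figure to Benchmark.txt per run.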
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import re
import datetime
from six.moves.urllib.parse import urlparse
import click
import scrapy
from scrapy.http import Request, HtmlResponse
from scrapy.linkextractors import LinkExtractor
#from books.items import Page
from scrapy.crawler import CrawlerProcess
from scrapy.crawler import CrawlerRunner
from twisted.internet import reactor, defer
from scrapy.utils.project import get_project_settings
from scrapy.utils.log import configure_logging


class Page(scrapy.Item):
    url = scrapy.Field()
    title = scrapy.Field()
    size = scrapy.Field()
    referer = scrapy.Field()
    newcookies = scrapy.Field()
    body = scrapy.Field()
    rating = scrapy.Field()
    price = scrapy.Field()
    category = scrapy.Field()
    stock = scrapy.Field()


class FollowAllSpider(scrapy.Spider):

    name = 'followall'

    # books.toscrape.com encodes star ratings as words in a CSS class.
    ratings_map = {
        'one': 1,
        'two': 2,
        'three': 3,
        'four': 4,
        'five': 5,
    }

    def __init__(self, **kw):
        super(FollowAllSpider, self).__init__(**kw)
        # Crawl a local mirror of books.toscrape.com.
        url = 'http://localhost/books.toscrape.com/index.html'
        if not url.startswith('http://') and not url.startswith('https://'):
            url = 'http://%s/' % url
        self.url = url
        self.allowed_domains = [re.sub(r'^www\.', '', urlparse(url).hostname)]
        self.link_extractor = LinkExtractor()
        self.cookies_seen = set()
        self.previtem = 0
        self.items = 0
        # Replaced with the elapsed timedelta once crawling starts.
        self.timesec = datetime.datetime.utcnow()

    def start_requests(self):
        return [Request(self.url, callback=self.parse, dont_filter=True)]

    def parse(self, response):
        """Parse a Page item and all requests to follow."""
        page = self._get_item(response)
        r = [page]
        r.extend(self._extract_requests(response))
        # Refresh the running totals from the crawler stats.
        self.items = self.crawler.stats.get_value('item_scraped_count', 0)
        pages = self.crawler.stats.get_value('response_received_count', 0)
        start = self.crawler.stats.get_value('start_time')
        self.timesec = datetime.datetime.utcnow() - start
        return r

    def close(self, reason):
        # Average throughput over the whole run, appended to Benchmark.txt.
        speed = self.items / self.timesec.total_seconds()
        with open("Benchmark.txt", 'a') as f:
            f.write(" {0}".format(speed))
        click.secho("\nThe average speed of the spider is {0} items/sec\n".format(
            speed), bold=True)

    def _get_item(self, response):
        item = Page(
            url=response.url,
            size=str(len(response.body)),
            referer=response.request.headers.get('Referer'),
            # The rating is the last word of the CSS class, e.g. "star-rating Three".
            rating=response.css('p.star-rating::attr(class)').extract_first().split(' ')[-1],
            title=response.css('.product_main h1::text').extract_first(),
            price=response.css('.product_main p.price_color::text').re_first('£(.*)'),
            stock=''.join(response.css('.product_main .instock.availability ::text').re(r'(\d+)')),
            category=''.join(response.css('ul.breadcrumb li:nth-last-child(2) ::text').extract()).strip(),
        )
        self._set_new_cookies(item, response)
        return item

    def _extract_requests(self, response):
        r = []
        if isinstance(response, HtmlResponse):
            links = self.link_extractor.extract_links(response)
            r.extend(Request(x.url, callback=self.parse) for x in links)
        return r

    def _set_new_cookies(self, page, response):
        # Record only cookies that have not been seen earlier in the crawl.
        cookies = []
        for cookie in [x.split(b';', 1)[0] for x in
                       response.headers.getlist('Set-Cookie')]:
            if cookie not in self.cookies_seen:
                self.cookies_seen.add(cookie)
                cookies.append(cookie)
        if cookies:
            page['newcookies'] = cookies


configure_logging()
runner = CrawlerRunner()


@defer.inlineCallbacks
def crawl():
    # Run the benchmark spider twice, one crawl after the other,
    # then shut the reactor down.
    yield runner.crawl(FollowAllSpider)
    yield runner.crawl(FollowAllSpider)
    # yield runner.crawl(FollowAllSpider)
    reactor.stop()


crawl()
reactor.run()
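
The script imports CrawlerProcess but drives the crawl with CrawlerRunner and an explicit reactor, which is what lets it chain two runs in a single process. For a single run, a minimal sketch of the CrawlerProcess alternative, assuming the same FollowAllSpider defined above, would look like this:

# Minimal single-run sketch using CrawlerProcess, which starts and stops
# the Twisted reactor itself, so no explicit reactor handling is needed.
from scrapy.crawler import CrawlerProcess

process = CrawlerProcess()
process.crawl(FollowAllSpider)
process.start()  # blocks until the crawl finishes

Chaining several crawls is simpler with CrawlerRunner, because CrawlerProcess.start stops the reactor once its crawls finish.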