Created
July 10, 2017 13:52
-
-
Save Parth-Vader/405852f891cdf2b568008c65699b86d1 to your computer and use it in GitHub Desktop.
Program to run the spider benchmark with pypy
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
# -*- coding: utf-8 -*- | |
import re | |
import datetime | |
from six.moves.urllib.parse import urlparse | |
import click | |
import scrapy | |
from scrapy.http import Request, HtmlResponse | |
from scrapy.linkextractors import LinkExtractor | |
#from books.items import Page | |
from scrapy.crawler import CrawlerProcess | |
from scrapy.crawler import CrawlerRunner | |
from twisted.internet import reactor, defer | |
from scrapy.utils.project import get_project_settings | |
from scrapy.utils.log import configure_logging | |
class Page(scrapy.Item):
    """Container for the data scraped from a single crawled page.

    General page metadata plus product-specific fields scraped from
    books.toscrape.com product pages (rating/price/category/stock may be
    unset on non-product pages).
    """
    url = scrapy.Field()         # response URL
    title = scrapy.Field()       # product title text
    size = scrapy.Field()        # response body length, as a string
    referer = scrapy.Field()     # Referer request header (bytes or None)
    newcookies = scrapy.Field()  # Set-Cookie values not seen before this page
    body = scrapy.Field()        # raw page body (not populated by this spider)
    rating = scrapy.Field()      # star-rating word, e.g. 'Three'
    price = scrapy.Field()       # price without the leading pound sign
    category = scrapy.Field()    # breadcrumb category name
    stock = scrapy.Field()       # digits of the availability count
class FollowAllSpider(scrapy.Spider):
    """Benchmark spider: follow every link on a local mirror of
    books.toscrape.com, emit a :class:`Page` item per response, and report
    the average scraping speed (items/sec) when the crawl closes.
    """
    name = 'followall'

    # Star-rating CSS class word -> numeric rating, for consumers that want
    # to normalise the scraped rating strings.
    ratings_map = {
        'one': 1,
        'two': 2,
        'three': 3,
        'four': 4,
        'five': 5,
    }

    def __init__(self, **kw):
        super(FollowAllSpider, self).__init__(**kw)
        url = 'http://localhost/books.toscrape.com/index.html'
        if not url.startswith('http://') and not url.startswith('https://'):
            url = 'http://%s/' % url
        self.url = url
        # Restrict the crawl to the start URL's host, ignoring a "www." prefix.
        self.allowed_domains = [re.sub(r'^www\.', '', urlparse(url).hostname)]
        self.link_extractor = LinkExtractor()
        self.cookies_seen = set()
        self.previtem = 0
        self.items = 0
        # Elapsed crawl time. Initialised as a zero timedelta (not a datetime,
        # as before) so close() can always call total_seconds() even when
        # parse() never ran.
        self.timesec = datetime.timedelta(0)

    def start_requests(self):
        """Seed the crawl with the configured start URL (never dupe-filtered)."""
        return [Request(self.url, callback=self.parse, dont_filter=True)]

    def parse(self, response):
        """Return a Page item for *response* plus requests for all its links."""
        page = self._get_item(response)
        r = [page]
        r.extend(self._extract_requests(response))
        # Refresh throughput counters so close() can report items/sec.
        self.items = self.crawler.stats.get_value('item_scraped_count', 0)
        start = self.crawler.stats.get_value('start_time')
        self.timesec = datetime.datetime.utcnow() - start
        return r

    def close(self, reason):
        """Append the average speed to Benchmark.txt and echo it to the console."""
        elapsed = self.timesec.total_seconds()
        # Guard against a crawl that produced no timing data (elapsed == 0).
        speed = self.items / elapsed if elapsed else 0.0
        with open("Benchmark.txt", 'a') as f:
            f.write(" {0}".format(speed))
        click.secho(
            "\nThe average speed of the spider is {0} items/sec\n".format(speed),
            bold=True)

    def _get_item(self, response):
        """Build a Page item from *response*.

        Product-only selectors (rating/title/price/stock/category) yield
        None/empty values on pages that lack them instead of crashing.
        """
        # The class attribute looks like "star-rating Three"; keep the last
        # word. extract_first() is None on pages without a rating element,
        # which used to raise AttributeError here.
        rating_cls = response.css('p.star-rating::attr(class)').extract_first()
        item = Page(
            url=response.url,
            size=str(len(response.body)),
            referer=response.request.headers.get('Referer'),
            rating=rating_cls.split(' ')[-1] if rating_cls else None,
            title=response.css('.product_main h1::text').extract_first(),
            price=response.css('.product_main p.price_color::text').re_first('£(.*)'),
            # Raw string: '\d' in a non-raw literal is a deprecated escape.
            stock=''.join(response.css('.product_main .instock.availability ::text').re(r'(\d+)')),
            category=''.join(response.css('ul.breadcrumb li:nth-last-child(2) ::text').extract()).strip(),
        )
        self._set_new_cookies(item, response)
        return item

    def _extract_requests(self, response):
        """Return follow-up Requests for every link found in an HTML response."""
        r = []
        if isinstance(response, HtmlResponse):
            links = self.link_extractor.extract_links(response)
            r.extend(Request(x.url, callback=self.parse) for x in links)
        return r

    def _set_new_cookies(self, page, response):
        """Record Set-Cookie values not seen before onto page['newcookies']."""
        cookies = []
        # Keep only the "name=value" part of each Set-Cookie header.
        for cookie in [x.split(b';', 1)[0] for x in
                       response.headers.getlist('Set-Cookie')]:
            if cookie not in self.cookies_seen:
                self.cookies_seen.add(cookie)
                cookies.append(cookie)
        if cookies:
            page['newcookies'] = cookies
# Script driver: run the benchmark spider twice under a CrawlerRunner and
# stop the twisted reactor once both runs finish.
configure_logging()
runner = CrawlerRunner()


@defer.inlineCallbacks
def crawl():
    # Two sequential runs so the benchmark covers back-to-back crawls.
    for _ in range(2):
        yield runner.crawl(FollowAllSpider)
    reactor.stop()


crawl()
reactor.run()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment