Skip to content

Instantly share code, notes, and snippets.

@ryonlife
Last active May 19, 2020 19:29
Show Gist options
  • Save ryonlife/e9131d8ac2c9ce1d089a98eb346905e7 to your computer and use it in GitHub Desktop.
scrapy-autointegration
# -*- coding: utf-8 -*-
import os
import shutil
import time
import yaml
from pegbot.utils import get_project_root
class AutointegrationPipeline:
    """
    Pipeline for generating autointegration test and fixture files.

    Piggybacks on the scrapy-autounit ``AUTOUNIT_ENABLED`` setting so that
    integration tests and fixtures are generated during the same crawl.
    """

    def open_spider(self, spider):  # pylint: disable=no-self-use
        """
        Create a fresh directory and test files for the spider being run.

        Any previously generated directory for this spider is removed first
        so stale fixtures do not leak into the new run. No-op when
        ``AUTOUNIT_ENABLED`` is off.
        """
        if not spider.settings.getbool('AUTOUNIT_ENABLED', default=False):
            # Piggyback off autounit settings so integration tests are
            # generated on the same crawl.
            return

        # Start from a clean per-spider directory. makedirs (vs. mkdir)
        # also creates missing parent directories on a fresh checkout.
        path = f'{get_project_root()}/autointegration/tests/{spider.name}'
        if os.path.exists(path):
            shutil.rmtree(path)
        os.makedirs(path)

        # Empty __init__.py so the directory is importable as a package.
        with open(f'{path}/__init__.py', 'w'):
            pass

        # Generated pytest module that replays the recorded fixtures.
        # The template text (including the tab indent) is emitted verbatim.
        test = (
            "# -*- coding: utf-8 -*-\n"
            "from autointegration.generate_test import generate_test\n"
            f"def test_{spider.name}():\n"
            f"\tgenerate_test('{spider.name}')()\n"
        )
        with open(f'{path}/test_{spider.name}.py', 'w') as file:
            file.write(test)

    def process_item(self, product, spider):  # pylint: disable=no-self-use
        """
        Write a fixture file for each scraped product.

        Returns the product unchanged so downstream pipelines keep working.
        No fixture is written when ``AUTOUNIT_ENABLED`` is off.
        """
        if not spider.settings.getbool('AUTOUNIT_ENABLED', default=False):
            # Piggyback off autounit settings so integration tests are
            # generated on the same crawl.
            return product

        path = f'{get_project_root()}/autointegration/tests/{spider.name}'
        # Timestamped filename keeps fixtures from one crawl unique.
        with open(f'{path}/fixture_{int(time.time())}.yaml', 'w') as file:
            file.write(yaml.dump({
                'url': product['url'],
                'product': {
                    'category': product['category'],
                    'name': product['name'],
                },
            }))
        return product
# -*- coding: utf-8 -*-
import os
import yaml
from scrapy import signals
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings
from w3lib.url import canonicalize_url
def generate_test(spider_name):
    """Generate an integration test of a VendorSpider subclass.

    Returns a zero-argument callable that crawls the recorded fixture URLs
    and asserts the scraped products match the recorded fixtures.
    """
    def _test():
        """Performs an integration test of a VendorSpider subclass."""
        # Configure the crawler; disable autounit so replaying the crawl
        # does not regenerate fixtures on top of itself.
        settings = get_project_settings()
        settings['AUTOUNIT_ENABLED'] = False
        process = CrawlerProcess(settings)
        crawler = process.create_crawler(spider_name)

        # Load fixtures from .yaml files, keyed by canonicalized URL so
        # lookups tolerate query-param order / escaping differences.
        fixtures = {}
        path = f'{os.path.dirname(os.path.abspath(__file__))}/tests/{spider_name}'
        for filename in os.listdir(path):
            if not filename.endswith('.yaml'):
                continue
            with open(os.path.join(path, filename)) as file:
                fixture = yaml.load(file, Loader=yaml.FullLoader)
            fixtures[canonicalize_url(fixture['url'])] = fixture['product']

        def _test_parse_product(item):
            """Tests for correct scraping of product info."""
            # BUG FIX: `fixtures` is keyed by canonicalized URLs, so the
            # scraped item's URL must be canonicalized before the membership
            # test and lookup too (previously only the error message was).
            url = canonicalize_url(item['url'])
            if url not in fixtures:
                raise AssertionError(f"Product URL mismatch: {url}")
            for key, val in fixtures[url].items():
                assert item[key] == val

        # Attach test handlers to various event signals
        # https://docs.scrapy.org/en/latest/topics/signals.html#topics-signals-ref
        crawler.signals.connect(_test_parse_product, signal=signals.item_scraped)

        # Run the crawler, seeded with the fixture URLs only.
        process.crawl(crawler, seed_urls=list(fixtures), crawl_patterns=[])
        process.start()

        # Integration test fails if errors have been counted in the
        # crawler's stats.
        if crawler.stats.get_value('log_count/ERROR'):
            raise AssertionError
    return _test
# -*- coding: utf-8 -*-
from autointegration.generate_test import generate_test
def test_name_of_spider():
    """Example of a generated test module for a spider named 'name_of_spider'.

    BUG FIX: the generator template emits ``generate_test('{spider.name}')``
    inside ``def test_{spider.name}()`` — the ``test_`` prefix belongs to the
    function name only, not to the spider name passed to ``generate_test``.
    """
    generate_test('name_of_spider')()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment