Skip to content

Instantly share code, notes, and snippets.

Last active Aug 5, 2021
What would you like to do?
Self-contained minimum example script to run scrapy
import json
from scrapy.crawler import Crawler
from scrapy.contrib.loader import ItemLoader
from scrapy.contrib.loader.processor import Join, MapCompose, TakeFirst
from scrapy import log, signals, Spider, Item, Field
from scrapy.settings import Settings
from twisted.internet import reactor
# define an item class
class DmozItem(Item):
    """One scraped directory entry: a link with its title and description."""

    # One Field per value extracted from each <li> element by the spider.
    title = Field()
    link = Field()
    desc = Field()
# define an item loader with input and output processors
class DmozItemLoader(ItemLoader):
    """ItemLoader for DmozItem: strips input whitespace, takes first value.

    `desc` is the exception — its values are joined into one string.
    """

    # BUG FIX: the original used MapCompose(unicode.strip), which raises
    # NameError on Python 3 (no `unicode` builtin). A lambda strips both
    # Python 2 `unicode` and Python 3 `str` values identically.
    default_input_processor = MapCompose(lambda s: s.strip())
    # Each field keeps only the first extracted value...
    default_output_processor = TakeFirst()
    # ...except `desc`, whose values are space-joined into one string.
    desc_out = Join()
# define a pipeline
class JsonWriterPipeline(object):
    """Pipeline that appends each scraped item to items.jl as JSON Lines."""

    def __init__(self):
        # Text mode ('w'), not 'wb': json.dumps returns str, and writing
        # str to a binary-mode file raises TypeError on Python 3.
        self.file = open('items.jl', 'w')

    def process_item(self, item, spider):
        """Serialize *item* to one JSON line, write it, and return the item.

        Returning the item keeps it flowing to any later pipelines.
        """
        line = json.dumps(dict(item)) + "\n"
        # BUG FIX: the original computed `line` but never wrote it,
        # so items.jl stayed empty.
        self.file.write(line)
        return item
# define a spider
# define a spider
class DmozSpider(Spider):
    """Spider that yields one DmozItem per <li> entry on each page."""

    name = "dmoz"
    # NOTE(review): an empty string effectively allows every domain;
    # set the real target domain here — TODO confirm intended site.
    allowed_domains = [""]
    # BUG FIX: the original list literal was left unterminated (syntax
    # error). The URL below is a placeholder reconstruction — TODO confirm
    # the intended start page(s).
    start_urls = [
        "http://www.dmoz.org/Computers/Programming/Languages/Python/Books/",
    ]

    def parse(self, response):
        """Extract title/link/desc from each //ul/li and yield loaded items."""
        for sel in response.xpath('//ul/li'):
            # Scope the loader to this <li> so the relative XPaths below
            # resolve against it rather than the whole page.
            loader = DmozItemLoader(DmozItem(), selector=sel, response=response)
            loader.add_xpath('title', 'a/text()')
            loader.add_xpath('link', 'a/@href')
            loader.add_xpath('desc', 'text()')
            yield loader.load_item()
# callback fired when the spider is closed
def callback(spider, reason):
    """Collect final crawl stats and stop the reactor so the script exits."""
    stats = spider.crawler.stats.get_stats()  # collect/log stats?
    # BUG FIX: the comment promised "stop the reactor" but the call was
    # missing — without it reactor.run() blocks forever after the crawl.
    reactor.stop()
# instantiate settings and provide a custom configuration
settings = Settings()
# BUG FIX: the original settings.set(...) call was left unterminated
# (truncated paste); close the dict and the call.
settings.set('ITEM_PIPELINES', {
    '__main__.JsonWriterPipeline': 100,
})

# instantiate a crawler passing in settings
crawler = Crawler(settings)

# instantiate a spider
spider = DmozSpider()

# configure signals: `callback` stops the reactor once the spider closes
crawler.signals.connect(callback, signal=signals.spider_closed)

# configure and start the crawler
# NOTE(review): reconstruction of the truncated tail using the legacy
# Crawler API this file imports — TODO confirm against the scrapy
# version actually in use (modern scrapy uses CrawlerProcess instead).
crawler.configure()
crawler.crawl(spider)
crawler.start()

# start logging
log.start()

# start the reactor (blocks execution until callback calls reactor.stop())
reactor.run()
Copy link

frankjdelgado commented Oct 4, 2016

scrapy.contrib is deprecated; the docs recommend importing from these modules instead:

from scrapy.loader import ItemLoader
from scrapy.loader.processors import TakeFirst, MapCompose, Join

Copy link

michaelbukachi commented May 15, 2017

You could use this also:
process = CrawlerProcess({'ITEM_PIPELINES': {'__main__.JsonWriterPipeline': 100}})

No need for a callback or starting/stopping the reactor manually

Copy link

dashw00d commented Dec 24, 2018

This was exactly what I was looking for!!! I've been wanting to transition my Scrapy projects to stand-alone for a while now. Thanks!!!
Here's how I ended up doing the settings:

settings = Settings({
    # pipelines start with the project/module name, so replace it with __main__
    'ITEM_PIPELINES': {
        '__main__.WriterPipeline': 100,
    },
    'DOWNLOADER_MIDDLEWARES': {
        'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
    },
})
process = CrawlerProcess(settings)

# you can run 30 of these at once if you want, e.g —
# process.crawl(CustomSpider)
# process.crawl(CustomSpider) etc.. * 30

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment