@alecxe

alecxe/runner.py

Last active Jan 10, 2020
Self-contained minimal example script to run Scrapy
import json

from scrapy.crawler import Crawler
from scrapy.contrib.loader import ItemLoader
from scrapy.contrib.loader.processor import Join, MapCompose, TakeFirst
from scrapy import log, signals, Spider, Item, Field
from scrapy.settings import Settings
from twisted.internet import reactor


# define an item class
class DmozItem(Item):
    title = Field()
    link = Field()
    desc = Field()


# define an item loader with input and output processors
class DmozItemLoader(ItemLoader):
    default_input_processor = MapCompose(unicode.strip)
    default_output_processor = TakeFirst()

    desc_out = Join()


# define a pipeline
class JsonWriterPipeline(object):
    def __init__(self):
        self.file = open('items.jl', 'wb')

    def process_item(self, item, spider):
        line = json.dumps(dict(item)) + "\n"
        self.file.write(line)
        return item


# define a spider
class DmozSpider(Spider):
    name = "dmoz"
    allowed_domains = ["dmoz.org"]
    start_urls = [
        "http://www.dmoz.org/Computers/Programming/Languages/Python/Books/",
        "http://www.dmoz.org/Computers/Programming/Languages/Python/Resources/"
    ]

    def parse(self, response):
        for sel in response.xpath('//ul/li'):
            loader = DmozItemLoader(DmozItem(), selector=sel, response=response)
            loader.add_xpath('title', 'a/text()')
            loader.add_xpath('link', 'a/@href')
            loader.add_xpath('desc', 'text()')
            yield loader.load_item()


# callback fired when the spider is closed
def callback(spider, reason):
    stats = spider.crawler.stats.get_stats()  # collect/log stats?

    # stop the reactor
    reactor.stop()


# instantiate settings and provide a custom configuration
settings = Settings()
settings.set('ITEM_PIPELINES', {
    '__main__.JsonWriterPipeline': 100
})

# instantiate a crawler passing in settings
crawler = Crawler(settings)

# instantiate a spider
spider = DmozSpider()

# configure signals
crawler.signals.connect(callback, signal=signals.spider_closed)

# configure and start the crawler
crawler.configure()
crawler.crawl(spider)
crawler.start()

# start logging
log.start()

# start the reactor (blocks execution)
reactor.run()
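Note: as written, this targets an older Scrapy API (Crawler(settings), crawler.configure(), scrapy.log) and Python 2 (unicode.strip, items.jl opened in 'wb' mode). See the comments below for the newer import paths and the CrawlerProcess-based approach.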
@frankjdelgado commented Oct 4, 2016

scrapy.contrib is deprecated; the recommended imports are these instead:

from scrapy.loader import ItemLoader
from scrapy.loader.processors import TakeFirst, MapCompose, Join
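For reference, a minimal sketch of the gist's loader under those newer import paths (assuming Python 3, where str.strip stands in for unicode.strip, and a Scrapy release that still ships scrapy.loader.processors):

from scrapy.loader import ItemLoader
from scrapy.loader.processors import TakeFirst, MapCompose, Join


class DmozItemLoader(ItemLoader):
    # str.strip replaces unicode.strip on Python 3
    default_input_processor = MapCompose(str.strip)
    default_output_processor = TakeFirst()

    desc_out = Join()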
@michaelbukachi commented May 15, 2017

You could also use this:

from scrapy.crawler import CrawlerProcess

process = CrawlerProcess({'ITEM_PIPELINES': {'__main__.JsonWriterPipeline': 100}})
process.crawl(DmozSpider)
process.start()

No need for a callback or starting/stopping the reactor manually
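If you still want the spider_closed hook from the gist (e.g. for the stats), one option is to build the crawler yourself and connect the signal before queueing it. A sketch, assuming a Scrapy release from around when this comment was written, where CrawlerProcess.create_crawler is available and the crawler's signal manager exists before the crawl starts:

from scrapy import signals
from scrapy.crawler import CrawlerProcess


def spider_closed(spider, reason):
    # inspect or log the crawl stats here
    print(spider.crawler.stats.get_stats())


process = CrawlerProcess({'ITEM_PIPELINES': {'__main__.JsonWriterPipeline': 100}})
crawler = process.create_crawler(DmozSpider)
crawler.signals.connect(spider_closed, signal=signals.spider_closed)
process.crawl(crawler)
process.start()  # blocks until all queued crawls finish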

@dashw00d commented Dec 24, 2018

This was exactly what I was looking for!!! I've been wanting to transition my Scrapy projects to stand-alone for a while now. Thanks!!!
Here's how I ended up doing the settings:

from scrapy.crawler import CrawlerProcess
from scrapy.settings import Settings

settings = Settings({
    # pipelines start with the project/module name, so replace it with __main__ here
    'ITEM_PIPELINES': {
        '__main__.WriterPipeline': 100,
    },
    'DOWNLOADER_MIDDLEWARES': {
        'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
    }
})
process = CrawlerProcess(settings)

# you can queue 30 of these at once if you want, e.g.
# process.crawl(CustomSpider)
# process.crawl(CustomSpider)  # ...and so on, 30 times
process.crawl(CustomSpider)
process.start()
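For what it's worth, crawl() also forwards extra keyword arguments to the spider's __init__, and every queued spider runs in the same reactor once start() is called. A small sketch (CustomSpider and the category argument are placeholders):

process.crawl(CustomSpider)
process.crawl(CustomSpider, category='books')  # kwargs are passed to the spider's __init__
process.start()  # runs all queued crawls concurrently and returns when they are done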