
@alecxe
Last active August 23, 2023 14:19
Self-contained minimal example script to run Scrapy
import json

from scrapy.crawler import Crawler
from scrapy.contrib.loader import ItemLoader
from scrapy.contrib.loader.processor import Join, MapCompose, TakeFirst
from scrapy import log, signals, Spider, Item, Field
from scrapy.settings import Settings
from twisted.internet import reactor


# define an item class
class DmozItem(Item):
    title = Field()
    link = Field()
    desc = Field()


# define an item loader with input and output processors
class DmozItemLoader(ItemLoader):
    default_input_processor = MapCompose(unicode.strip)
    default_output_processor = TakeFirst()

    desc_out = Join()


# define a pipeline
class JsonWriterPipeline(object):
    def __init__(self):
        self.file = open('items.jl', 'wb')

    def process_item(self, item, spider):
        line = json.dumps(dict(item)) + "\n"
        self.file.write(line)
        return item


# define a spider
class DmozSpider(Spider):
    name = "dmoz"
    allowed_domains = ["dmoz.org"]
    start_urls = [
        "http://www.dmoz.org/Computers/Programming/Languages/Python/Books/",
        "http://www.dmoz.org/Computers/Programming/Languages/Python/Resources/"
    ]

    def parse(self, response):
        for sel in response.xpath('//ul/li'):
            loader = DmozItemLoader(DmozItem(), selector=sel, response=response)
            loader.add_xpath('title', 'a/text()')
            loader.add_xpath('link', 'a/@href')
            loader.add_xpath('desc', 'text()')
            yield loader.load_item()


# callback fired when the spider is closed
def callback(spider, reason):
    stats = spider.crawler.stats.get_stats()  # collect/log stats?

    # stop the reactor
    reactor.stop()


# instantiate settings and provide a custom configuration
settings = Settings()
settings.set('ITEM_PIPELINES', {
    '__main__.JsonWriterPipeline': 100
})

# instantiate a crawler passing in settings
crawler = Crawler(settings)

# instantiate a spider
spider = DmozSpider()

# configure signals
crawler.signals.connect(callback, signal=signals.spider_closed)

# configure and start the crawler
crawler.configure()
crawler.crawl(spider)
crawler.start()

# start logging
log.start()

# start the reactor (blocks execution)
reactor.run()
@frankjdelgado

scrapy.contrib is deprecated; they recommend using these imports instead:

from scrapy.loader import ItemLoader
from scrapy.loader.processors import TakeFirst, MapCompose, Join
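
For example, the loader from the gist could be rewritten against the new module paths. A minimal sketch, assuming Python 3 (so str.strip replaces unicode.strip):

from scrapy.loader import ItemLoader
from scrapy.loader.processors import Join, MapCompose, TakeFirst


class DmozItemLoader(ItemLoader):
    # str.strip instead of unicode.strip on Python 3
    default_input_processor = MapCompose(str.strip)
    default_output_processor = TakeFirst()

    desc_out = Join()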

@michaelbukachi

You could also use this:

from scrapy.crawler import CrawlerProcess

process = CrawlerProcess({'ITEM_PIPELINES': {'__main__.JsonWriterPipeline': 100}})
process.crawl(DmozSpider)
process.start()

No need for a callback or starting/stopping the reactor manually.
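
If you still want the spider_closed callback from the gist (e.g. to read the crawl stats), here is a sketch of one way to wire it up, assuming Scrapy >= 1.0 where CrawlerProcess owns the reactor:

from scrapy import signals
from scrapy.crawler import CrawlerProcess


def spider_closed(spider, reason):
    # the stats are still reachable through the crawler, as in the gist
    print(spider.crawler.stats.get_stats())


process = CrawlerProcess({'ITEM_PIPELINES': {'__main__.JsonWriterPipeline': 100}})
crawler = process.create_crawler(DmozSpider)
crawler.signals.connect(spider_closed, signal=signals.spider_closed)
process.crawl(crawler)   # crawl() also accepts a Crawler instance
process.start()          # blocks until the crawl is finished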

@dashw00d

This was exactly what I was looking for!!! I've been wanting to transition my Scrapy projects to stand-alone for a while now. Thanks!!!
Here's how I ended up doing the settings:

from scrapy.crawler import CrawlerProcess
from scrapy.settings import Settings

settings = Settings({
    # pipelines start with the project/module name, so replace it with __main__
    'ITEM_PIPELINES': {
        '__main__.WriterPipeline': 100,
    },
    'DOWNLOADER_MIDDLEWARES': {
        'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
    }
})
process = CrawlerProcess(settings)

# you can run 30 of these at once if you want, e.g.:
# process.crawl(CustomSpider)
# process.crawl(CustomSpider)
# ... and so on
process.crawl(CustomSpider)
process.start()
