Skip to content

Instantly share code, notes, and snippets.

@StasDeep
Created June 9, 2020 16:24
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save StasDeep/236922e448ac33b354cf5ea6612e8fde to your computer and use it in GitHub Desktop.
Duplicated feed logs with BlockingFeedStorage
from time import sleep
import scrapy
from scrapy.extensions.feedexport import BlockingFeedStorage
class GsFeedStorage(BlockingFeedStorage):
    """Stub feed storage used to reproduce duplicated feed logs.

    Instead of uploading the serialized feed anywhere, the background
    store thread just sleeps, keeping the storage "busy" long enough to
    surface the duplicated-log behaviour this gist demonstrates.
    """

    def __init__(self, uri):
        # Remember the target URI; no connection or client is created here.
        self.uri = uri

    def _store_in_thread(self, file):
        # Simulate a slow upload without touching any real backend.
        sleep(2)
class QuotesSpider(scrapy.Spider):
    """Minimal spider that exports identical items to two feeds.

    Both feeds use the ``gs://`` scheme, which ``FEED_STORAGES`` maps to
    the stub ``test.GsFeedStorage`` backend, triggering the duplicated
    feed-log behaviour this gist reproduces.
    """

    name = 'quotes'
    start_urls = ['http://quotes.toscrape.com/']
    custom_settings = {
        'FEEDS': {
            'gs://bucket/output.json': {'format': 'json'},
            'gs://bucket/output.csv': {'format': 'csv'},
        },
        'FEED_STORAGES': {'gs': 'test.GsFeedStorage'},
    }

    def parse(self, response):
        """Yield one item dict per quote block found on the page."""
        for block in response.css('div.quote'):
            item = {
                'text': block.css('.text::text').get(),
                'author': block.css('.author::text').get(),
                'tags': block.css('a.tag::text').getall(),
            }
            yield item
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment