Skip to content

Instantly share code, notes, and snippets.

@marianmoldovan
Created March 6, 2015 09:07
Show Gist options
  • Save marianmoldovan/03996da2151586a6e600 to your computer and use it in GitHub Desktop.
Save marianmoldovan/03996da2151586a6e600 to your computer and use it in GitHub Desktop.
import pymongo
from scrapy.exceptions import DropItem
from scrapy.conf import settings
from scrapy import log
class MongoDBPipeline(object):
    """Scrapy item pipeline that validates items and stores them in MongoDB.

    Connection parameters are read from the Scrapy settings keys
    ``MONGODB_SERVER``, ``MONGODB_PORT``, ``MONGODB_DB`` and
    ``MONGODB_COLLECTION``.
    """

    def __init__(self):
        """Connect to MongoDB and select the target collection."""
        # MongoClient replaces pymongo.Connection, which was deprecated in
        # PyMongo 2.4 and removed in PyMongo 3.0.
        client = pymongo.MongoClient(
            settings['MONGODB_SERVER'], settings['MONGODB_PORT'])
        db = client[settings['MONGODB_DB']]
        self.collection = db[settings['MONGODB_COLLECTION']]

    def process_item(self, item, spider):
        """Store *item* in MongoDB once every field is known to have a value.

        Args:
            item: the scraped, dict-like item.
            spider: the spider that produced the item; forwarded to the log.

        Returns:
            The item that was passed in, so later pipelines can process it.

        Raises:
            DropItem: if any field of the item holds an empty (falsy) value.
        """
        record = dict(item)
        for field, value in record.items():
            # Here we only check that the value is not empty, but any
            # stricter validation could be plugged in. The check must look
            # at the value: iterating the item directly yields field names,
            # which are never empty.
            if not value:
                # .get() so an item without a 'url' still raises DropItem
                # instead of a KeyError while building the message.
                raise DropItem("Missing %s of blogpost from %s" %
                               (field, record.get('url')))

        # insert_one replaces Collection.insert (deprecated in PyMongo 3.0,
        # removed in 4.0). It adds an '_id' key to `record`, which is a copy,
        # so the item handed back to Scrapy is left untouched.
        self.collection.insert_one(record)
        log.msg("Item wrote to MongoDB database %s/%s" %
                (settings['MONGODB_DB'], settings['MONGODB_COLLECTION']),
                level=log.DEBUG, spider=spider)
        return item
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment