Scrapy MySQL pipeline. Just a mirror of the asynchronous MySQL pipeline. Copy-paste it directly into pipelines.py. Database credentials are stored in settings.py. Based on http://snipplr.com/view/66986/
import MySQLdb.cursors
from twisted.enterprise import adbapi

from scrapy.xlib.pydispatch import dispatcher
from scrapy import signals
from scrapy.utils.project import get_project_settings
from scrapy import log

SETTINGS = get_project_settings()


class MySQLPipeline(object):

    @classmethod
    def from_crawler(cls, crawler):
        return cls(crawler.stats)

    def __init__(self, stats):
        # Instantiate the asynchronous DB connection pool.
        self.dbpool = adbapi.ConnectionPool(
            'MySQLdb',
            host=SETTINGS['DB_HOST'],
            user=SETTINGS['DB_USER'],
            passwd=SETTINGS['DB_PASSWD'],
            port=SETTINGS['DB_PORT'],
            db=SETTINGS['DB_DB'],
            charset='utf8',
            use_unicode=True,
            cursorclass=MySQLdb.cursors.DictCursor
        )
        self.stats = stats
        dispatcher.connect(self.spider_closed, signals.spider_closed)

    def spider_closed(self, spider):
        """Cleanup function, called after crawling has finished.

        Closes the ConnectionPool."""
        self.dbpool.close()

    def process_item(self, item, spider):
        # Run the insert in a thread from the pool; errors are logged
        # via the errback instead of interrupting the crawl.
        query = self.dbpool.runInteraction(self._insert_record, item)
        query.addErrback(self._handle_error)
        return item

    def _insert_record(self, tx, item):
        # Placeholder query -- replace with an INSERT for your own table and item fields.
        result = tx.execute(
            """ INSERT INTO table VALUES (1,2,3)"""
        )
        if result > 0:
            self.stats.inc_value('database/items_added')

    def _handle_error(self, e):
        log.err(e)
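The INSERT above is only a stub. As a rough sketch of how _insert_record could be adapted (assuming a hypothetical articles table and an item with title and url fields, neither of which is part of the original gist), a parameterized version might look like this:

    def _insert_record(self, tx, item):
        # Hypothetical schema: an `articles` table with `title` and `url` columns.
        # Parameters are passed separately so MySQLdb handles escaping.
        result = tx.execute(
            """INSERT INTO articles (title, url) VALUES (%s, %s)""",
            (item['title'], item['url'])
        )
        if result > 0:
            self.stats.inc_value('database/items_added')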
# Add these to your settings.py file

# Database settings
DB_HOST = 'localhost'
DB_PORT = 3306
DB_USER = 'user'
DB_PASSWD = 'password'
DB_DB = 'database'
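The pipeline also has to be enabled in settings.py before Scrapy will call it. Assuming the project package is named myproject (adjust the dotted path to your own project), something like:

ITEM_PIPELINES = {
    'myproject.pipelines.MySQLPipeline': 300,
}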