Skip to content

Instantly share code, notes, and snippets.

@tzermias
Last active March 14, 2024 23:35
Show Gist options
  • Star 28 You must be signed in to star a gist
  • Fork 9 You must be signed in to fork a gist
  • Save tzermias/6982723 to your computer and use it in GitHub Desktop.
Save tzermias/6982723 to your computer and use it in GitHub Desktop.
Scrapy MySQL pipeline. Just a mirror of the asynchronous MySQL pipeline. Copy and paste it directly into pipelines.py. Database credentials are stored in settings.py. Based on http://snipplr.com/view/66986/
import MySQLdb.cursors
from twisted.enterprise import adbapi
from scrapy.xlib.pydispatch import dispatcher
from scrapy import signals
from scrapy.utils.project import get_project_settings
from scrapy import log
# Project settings, loaded once when this module is imported. Expected to hold
# the DB_* values (DB_HOST, DB_PORT, DB_USER, DB_PASSWD, DB_DB) used to connect
# to MySQL.
SETTINGS = get_project_settings()
class MySQLPipeline(object):
    """Scrapy item pipeline that inserts records into MySQL asynchronously.

    Database work is handed to a Twisted ``adbapi.ConnectionPool`` so that
    ``process_item`` does not wait for the INSERT to finish. Connection
    parameters are read from the ``DB_HOST``, ``DB_USER``, ``DB_PASSWD``,
    ``DB_PORT`` and ``DB_DB`` settings.
    """

    @classmethod
    def from_crawler(cls, crawler):
        """Create the pipeline for ``crawler`` and hook up its cleanup.

        The crawler's own settings are used (so per-crawler overrides are
        honoured), and ``spider_closed`` is connected through the crawler's
        signal manager so that only this crawler's signal closes the pool.
        """
        pipeline = cls(crawler.stats, crawler.settings)
        crawler.signals.connect(pipeline.spider_closed,
                                signal=signals.spider_closed)
        return pipeline

    def __init__(self, stats, settings=None):
        """Open the connection pool.

        Args:
            stats: stats collector; ``inc_value`` is called on it for every
                successful insert.
            settings: mapping-like settings object providing the ``DB_*``
                keys. Defaults to the module-level ``SETTINGS`` when omitted.
        """
        if settings is None:
            settings = SETTINGS

        # Instantiate DB
        self.dbpool = adbapi.ConnectionPool(
            'MySQLdb',
            host=settings['DB_HOST'],
            user=settings['DB_USER'],
            passwd=settings['DB_PASSWD'],
            port=settings['DB_PORT'],
            db=settings['DB_DB'],
            charset='utf8',
            use_unicode=True,
            cursorclass=MySQLdb.cursors.DictCursor,
        )
        self.stats = stats

    def spider_closed(self, spider):
        """Cleanup function, called after crawling has finished.

        Closes the ConnectionPool.
        """
        self.dbpool.close()

    def process_item(self, item, spider):
        """Schedule the insert for ``item`` and return the item unchanged.

        The insert runs in the pool via ``runInteraction``; failures are
        routed to ``_handle_error`` instead of being raised here.
        """
        query = self.dbpool.runInteraction(self._insert_record, item)
        query.addErrback(self._handle_error)
        return item

    def _insert_record(self, tx, item):
        """Run the INSERT inside the transaction ``tx`` and count successes.

        NOTE(review): the statement below is a placeholder that ignores
        ``item``; replace it with a parameterized INSERT for your own table
        and pass the item's values as query arguments.
        """
        result = tx.execute(
            """ INSERT INTO table VALUES (1,2,3)"""
        )
        # Presumably the affected-row count reported by the DB driver;
        # only count the item when something was actually written.
        if result > 0:
            self.stats.inc_value('database/items_added')

    def _handle_error(self, e):
        """Errback for the insert: log the failure ``e``."""
        log.err(e)
# Add these to your settings.py file.
# Database settings: MySQLPipeline passes them to the connection pool as
# host, port, user, passwd and db respectively.
DB_HOST = 'localhost'
DB_PORT = 3306
DB_USER = 'user'
DB_PASSWD = 'password'
DB_DB = 'database'
@wmadaus
Copy link

wmadaus commented Jan 15, 2016

Shouldn't _insert_record and _handle_error be part of the pipeline class?

@Lawrence-Liu
Copy link

Agreed with wmadaus

@deathemperor
Copy link

He actually wrote them as part of the pipeline class (look at the calls), but the indentation was wrong.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment