scrapy
# When crawling, we often run into pages whose content is generated with JavaScript (e.g. AJAX requests, jQuery craziness), which Scrapy cannot crawl on its own. However, if you use Scrapy together with the web testing framework Selenium, you can crawl anything that is displayed in a normal web browser.
#
# Some things to note:
# You must have the Python version of Selenium RC installed, and Selenium must be set up properly, for this to work. Also, this is just a template crawler; you could get much crazier and more advanced, but I just wanted to show the basic idea. As the code stands now, you will be making two requests for any given URL: one by Scrapy and one by Selenium. There are probably ways around this so that Selenium makes the one and only request, but I did not bother to implement that, and by making two requests you get to crawl the page with Scrapy too.
#
# This is quite powerful because you now have the entire rendered DOM available to crawl, while still using all the nice crawling features in Scrapy. It makes for slower crawling of course, but depending on how much you need the rendered DOM, it might be worth the wait.
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector
from scrapy.http import Request
from scrapy.item import Item
from selenium import selenium
import time


class SeleniumSpider(CrawlSpider):
    name = "SeleniumSpider"
    start_urls = ["http://www.domain.com"]

    rules = (
        Rule(SgmlLinkExtractor(allow=(r'\.html', )), callback='parse_page', follow=True),
    )

    def __init__(self):
        CrawlSpider.__init__(self)
        self.verificationErrors = []
        self.selenium = selenium("localhost", 4444, "*chrome", "http://www.domain.com")
        self.selenium.start()

    def __del__(self):
        self.selenium.stop()
        print self.verificationErrors

    def parse_page(self, response):
        item = Item()
        hxs = HtmlXPathSelector(response)

        # Do some XPath selection with Scrapy
        hxs.select('//div').extract()

        sel = self.selenium
        sel.open(response.url)

        # Wait for javascript to load in Selenium
        time.sleep(2.5)

        # Do some crawling of javascript created content with Selenium
        sel.get_text("//div")
        yield item
# Snippet imported from snippets.scrapy.org (which no longer works)
# author: wynbennett
# date : Jun 21, 2011
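# Side note: relying on __del__ for cleanup is fragile, since there is no guarantee of when (or
# whether) it runs. A common alternative is to hook Scrapy's spider_closed signal so the Selenium
# session is stopped when the spider finishes. A minimal, untested sketch of that wiring; the
# subclass name is just an example and the signals import path varies between old Scrapy versions:
from scrapy.xlib.pydispatch import dispatcher
from scrapy import signals


class SeleniumSpiderWithCleanup(SeleniumSpider):
    def __init__(self):
        SeleniumSpider.__init__(self)
        dispatcher.connect(self.stop_selenium, signal=signals.spider_closed)

    def stop_selenium(self, spider):
        # only stop the browser when it is this spider that closed
        if spider is self:
            self.selenium.stop()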
# This is a piece of code that uses webdriver to load and render a page with Scrapy and Selenium.
#
# This work is based on the snippet [wynbennett](http://snippets.scrapy.org/users/wynbennett/) [posted here](http://snippets.scrapy.org/snippets/21/) some time ago
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector
from scrapy.http import Request
from myItem.items import myItem
from selenium import webdriver
from selenium.webdriver.firefox.firefox_profile import FirefoxProfile
import time
import pprint


class WebDriverSpider(CrawlSpider):
    name = "WebDriverSpider"
    start_urls = ["http://yourDomain.com/yourUrl.html"]

    rules = (
        Rule(SgmlLinkExtractor(allow=(r'\.html', ), allow_domains=('yourDomain.com', )),
             callback='parse_page', follow=False),
    )

    def __init__(self):
        CrawlSpider.__init__(self)
        self.verificationErrors = []
        # create a Firefox profile with specific add-ons and tell webdriver to load it
        profile = FirefoxProfile(profile_directory="/home/yourUser/.mozilla/firefox/selenium/")
        self.selenium = webdriver.Firefox(profile)

    def __del__(self):
        self.selenium.quit()
        print self.verificationErrors

    def parse_page(self, response):
        # normal scrapy result
        hxs = HtmlXPathSelector(response)

        # webdriver rendered page
        sel = self.selenium
        sel.get(response.url)

        if sel:
            # Wait for javascript to load in Selenium
            time.sleep(2.5)

            # Do some crawling of javascript created content with Selenium
            item = myItem()
            item['url'] = response.url
            item['title'] = hxs.select('//title/text()').extract()

            # something you can do only with webdrivers
            item['thatDiv'] = sel.find_element_by_id("thatDiv")
            yield item
# Snippet imported from snippets.scrapy.org (which no longer works)
# author: rollsappletree
# date : Aug 25, 2011
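# Side note: a fixed time.sleep(2.5) is just a guess at how long the JavaScript takes. Selenium's
# WebDriverWait can poll for a specific element instead. A rough, untested sketch, assuming (purely
# for illustration) that the appearance of the "thatDiv" element means the page has finished rendering:
from selenium.webdriver.support.ui import WebDriverWait


def wait_for_render(driver, timeout=10):
    """Poll until the element we care about exists, or raise TimeoutException."""
    return WebDriverWait(driver, timeout).until(
        lambda d: d.find_element_by_id("thatDiv"))

# inside parse_page, instead of time.sleep(2.5):
#     wait_for_render(sel)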
# When you run the Scrapy crawler from a program, the code blocks until the Scrapy crawler is finished. This is due to how Twisted (the underlying asynchronous network library) works. This prevents using the Scrapy crawler from scripts or other code.
#
# To circumvent this issue you can run the Scrapy crawler in a thread with this code.
#
# Keep in mind that this code is mainly for illustrative purposes and far from production ready.
#
# Also the code was only tested with Scrapy 0.8, and will probably need some adjustments for newer versions (since the core API isn't stable yet), but you get the idea.
"""
Code to run Scrapy crawler in a thread - works on Scrapy 0.8
"""
import threading, Queue
from twisted.internet import reactor
from scrapy.xlib.pydispatch import dispatcher
from scrapy.core.manager import scrapymanager
from scrapy.core.engine import scrapyengine
from scrapy.core import signals
class CrawlerThread(threading.Thread):
def __init__(self):
threading.Thread.__init__(self)
self.running = False
def run(self):
self.running = True
scrapymanager.configure(control_reactor=False)
scrapymanager.start()
reactor.run(installSignalHandlers=False)
def crawl(self, *args):
if not self.running:
raise RuntimeError("CrawlerThread not running")
self._call_and_block_until_signal(signals.spider_closed, \
scrapymanager.crawl, *args)
def stop(self):
reactor.callFromThread(scrapyengine.stop)
def _call_and_block_until_signal(self, signal, f, *a, **kw):
q = Queue.Queue()
def unblock():
q.put(None)
dispatcher.connect(unblock, signal=signal)
reactor.callFromThread(f, *a, **kw)
q.get()
# Usage example below:

import os
os.environ.setdefault('SCRAPY_SETTINGS_MODULE', 'myproject.settings')

from scrapy.xlib.pydispatch import dispatcher
from scrapy.core import signals
from scrapy.conf import settings
from scrapy.crawler import CrawlerThread

settings.overrides['LOG_ENABLED'] = False  # avoid log noise


def item_passed(item):
    print "Just scraped item:", item

dispatcher.connect(item_passed, signal=signals.item_passed)

crawler = CrawlerThread()
print "Starting crawler thread..."
crawler.start()
print "Crawling somedomain.com...."
crawler.crawl('somedomain.com')  # blocking call
print "Crawling anotherdomain.com..."
crawler.crawl('anotherdomain.com')  # blocking call
print "Stopping crawler thread..."
crawler.stop()
# Snippet imported from snippets.scrapy.org (which no longer works)
# author: pablo
# date : Aug 11, 2010
# This middleware can be used to avoid re-visiting already visited items, which can be useful for speeding up the scraping of projects with immutable items, i.e. items that, once scraped, don't change.
from scrapy import log
from scrapy.http import Request
from scrapy.item import BaseItem
from scrapy.utils.request import request_fingerprint

from myproject.items import MyItem


class IgnoreVisitedItems(object):
    """Middleware to ignore re-visiting item pages if they were already visited
    before. Requests to be filtered must have the meta['filter_visited'] flag
    enabled, and may optionally define an id to use for identifying them, which
    defaults to the request fingerprint, although you'd want to use the item id,
    if you already have it beforehand, to make it more robust.
    """

    FILTER_VISITED = 'filter_visited'
    VISITED_ID = 'visited_id'
    CONTEXT_KEY = 'visited_ids'

    def process_spider_output(self, response, result, spider):
        context = getattr(spider, 'context', {})
        visited_ids = context.setdefault(self.CONTEXT_KEY, {})
        ret = []
        for x in result:
            visited = False
            if isinstance(x, Request):
                if self.FILTER_VISITED in x.meta:
                    visit_id = self._visited_id(x)
                    if visit_id in visited_ids:
                        log.msg("Ignoring already visited: %s" % x.url,
                                level=log.INFO, spider=spider)
                        visited = True
            elif isinstance(x, BaseItem):
                visit_id = self._visited_id(response.request)
                if visit_id:
                    visited_ids[visit_id] = True
                    x['visit_id'] = visit_id
                    x['visit_status'] = 'new'
            if visited:
                ret.append(MyItem(visit_id=visit_id, visit_status='old'))
            else:
                ret.append(x)
        return ret

    def _visited_id(self, request):
        return request.meta.get(self.VISITED_ID) or request_fingerprint(request)
# Snippet imported from snippets.scrapy.org (which no longer works)
# author: pablo
# date : Aug 10, 2010
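# To actually use the middleware above, enable it in the project settings and set the
# 'filter_visited' flag on the requests that point at item pages. A hedged usage sketch:
# the module path, priority value, XPath and callback name below are made up for illustration.
#
# SPIDER_MIDDLEWARES = {
#     'myproject.middlewares.IgnoreVisitedItems': 560,
# }

from scrapy.http import Request
from scrapy.selector import HtmlXPathSelector


def parse_listing(self, response):
    # inside a spider: flag item-page requests so that already-visited ones are
    # replaced with MyItem(visit_status='old') instead of being re-crawled
    hxs = HtmlXPathSelector(response)
    for url in hxs.select('//a[@class="item"]/@href').extract():
        yield Request(url, callback=self.parse_item,
                      meta={'filter_visited': True})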
# Standard Python library imports

# 3rd party modules
import pymongo

from scrapy import log
from scrapy.conf import settings
from scrapy.exceptions import DropItem


class MongoDBPipeline(object):
    def __init__(self):
        self.server = settings['MONGODB_SERVER']
        self.port = settings['MONGODB_PORT']
        self.db = settings['MONGODB_DB']
        self.col = settings['MONGODB_COLLECTION']
        connection = pymongo.Connection(self.server, self.port)
        db = connection[self.db]
        self.collection = db[self.col]

    def process_item(self, item, spider):
        err_msg = ''
        for field, data in item.items():
            if not data:
                err_msg += 'Missing %s of poem from %s\n' % (field, item['url'])
        if err_msg:
            raise DropItem(err_msg)
        self.collection.insert(dict(item))
        log.msg('Item written to MongoDB database %s/%s' % (self.db, self.col),
                level=log.DEBUG, spider=spider)
        return item
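# The MongoDBPipeline above reads its connection details from the project settings, so something
# along these lines needs to go in settings.py. The values are placeholders and the pipeline
# module path is an assumption; on the old Scrapy versions these snippets target, ITEM_PIPELINES
# is a list of dotted paths:

# settings.py
MONGODB_SERVER = 'localhost'
MONGODB_PORT = 27017
MONGODB_DB = 'mydb'
MONGODB_COLLECTION = 'poems'

ITEM_PIPELINES = ['myproject.pipelines.MongoDBPipeline']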
# Cannot use this to create the table, must have table already created

from twisted.enterprise import adbapi
import datetime
import MySQLdb.cursors

from scrapy import log


class SQLStorePipeline(object):
    def __init__(self):
        self.dbpool = adbapi.ConnectionPool('MySQLdb', db='mydb',
                user='myuser', passwd='mypass',
                cursorclass=MySQLdb.cursors.DictCursor,
                charset='utf8', use_unicode=True)

    def process_item(self, item, spider):
        # run db query in thread pool
        query = self.dbpool.runInteraction(self._conditional_insert, item)
        query.addErrback(self.handle_error)
        return item

    def _conditional_insert(self, tx, item):
        # create record if it doesn't exist.
        # this whole block runs in its own thread
        tx.execute("select * from websites where link = %s", (item['link'][0], ))
        result = tx.fetchone()
        if result:
            log.msg("Item already stored in db: %s" % item, level=log.DEBUG)
        else:
            tx.execute(
                "insert into websites (link, created) "
                "values (%s, %s)",
                (item['link'][0], datetime.datetime.now())
            )
            log.msg("Item stored in db: %s" % item, level=log.DEBUG)

    def handle_error(self, e):
        log.err(e)
# Snippet imported from snippets.scrapy.org (which no longer works)
# author: redtricycle
# date : Nov 21, 2011
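# The SQLStorePipeline above assumes the `websites` table already exists, so it has to be created
# up front. A minimal sketch of a schema inferred from the insert statement; the column types and
# sizes are assumptions, and the connection details should match the ConnectionPool arguments:

import MySQLdb


def create_websites_table():
    # run once, outside the crawl
    conn = MySQLdb.connect(db='mydb', user='myuser', passwd='mypass', charset='utf8')
    cursor = conn.cursor()
    cursor.execute("""
        CREATE TABLE IF NOT EXISTS websites (
            link VARCHAR(255) NOT NULL PRIMARY KEY,
            created DATETIME
        )
    """)
    conn.commit()
    conn.close()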
# You can use this middleware to send a random user agent with every request the spider makes.
# You can define a USER_AGENT_LIST in your settings and the spider will choose a random user agent from that list every time.
#
# You will have to disable the default user agent middleware and add this to your settings file.
#
# DOWNLOADER_MIDDLEWARES = {
# 'scraper.random_user_agent.RandomUserAgentMiddleware': 400,
# 'scrapy.contrib.downloadermiddleware.useragent.UserAgentMiddleware': None,
# }
from scraper.settings import USER_AGENT_LIST
import random
from scrapy import log


class RandomUserAgentMiddleware(object):
    def process_request(self, request, spider):
        ua = random.choice(USER_AGENT_LIST)
        if ua:
            request.headers.setdefault('User-Agent', ua)
        #log.msg('>>>> UA %s' % request.headers)
# Snippet imported from snippets.scrapy.org (which no longer works)
# author: dushyant
# date : Sep 16, 2011
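# The middleware above expects a USER_AGENT_LIST defined in the scraper's settings module
# (scraper/settings.py in this layout). The strings below are just sample user agents:

USER_AGENT_LIST = [
    'Mozilla/5.0 (Windows NT 6.1; rv:10.0) Gecko/20100101 Firefox/10.0',
    'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/13.0.782.24 Safari/535.1',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7) AppleWebKit/534.48.3 (KHTML, like Gecko) Version/5.1 Safari/534.48.3',
]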
# This script shows how to crawl a site without setting up a complete project.
#
# Note: `crawler.start()` can't be called more than once due to Twisted's reactor limitation.
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# author: Rolando Espinoza La fuente
#
# Changelog:
# 24/07/2011 - updated to work with scrapy 13.0dev
# 25/08/2010 - initial version. works with scrapy 0.9

from scrapy.contrib.loader import XPathItemLoader
from scrapy.item import Item, Field
from scrapy.selector import HtmlXPathSelector
from scrapy.spider import BaseSpider


class QuestionItem(Item):
    """Our SO Question Item"""
    title = Field()
    summary = Field()
    tags = Field()
    user = Field()
    posted = Field()
    votes = Field()
    answers = Field()
    views = Field()


class MySpider(BaseSpider):
    """Our ad-hoc spider"""
    name = "myspider"
    start_urls = ["http://stackoverflow.com/"]

    question_list_xpath = '//div[@id="content"]//div[contains(@class, "question-summary")]'

    def parse(self, response):
        hxs = HtmlXPathSelector(response)

        for qxs in hxs.select(self.question_list_xpath):
            loader = XPathItemLoader(QuestionItem(), selector=qxs)
            loader.add_xpath('title', './/h3/a/text()')
            loader.add_xpath('summary', './/h3/a/@title')
            loader.add_xpath('tags', './/a[@rel="tag"]/text()')
            loader.add_xpath('user', './/div[@class="started"]/a[2]/text()')
            loader.add_xpath('posted', './/div[@class="started"]/a[1]/span/@title')
            loader.add_xpath('votes', './/div[@class="votes"]/div[1]/text()')
            loader.add_xpath('answers', './/div[contains(@class, "answered")]/div[1]/text()')
            loader.add_xpath('views', './/div[@class="views"]/div[1]/text()')

            yield loader.load_item()


def main():
    """Sets up the item signal and runs the spider"""
    # set up signal to catch items scraped
    from scrapy import signals
    from scrapy.xlib.pydispatch import dispatcher

    def catch_item(sender, item, **kwargs):
        print "Got:", item

    dispatcher.connect(catch_item, signal=signals.item_passed)

    # shut off log
    from scrapy.conf import settings
    settings.overrides['LOG_ENABLED'] = False

    # set up crawler
    from scrapy.crawler import CrawlerProcess

    crawler = CrawlerProcess(settings)
    crawler.install()
    crawler.configure()

    # schedule spider
    crawler.crawl(MySpider())

    # start engine scrapy/twisted
    print "STARTING ENGINE"
    crawler.start()
    print "ENGINE STOPPED"


if __name__ == '__main__':
    main()
# Snippet imported from snippets.scrapy.org (which no longer works)
# author: darkrho
# date : Aug 25, 2010
# Standard Python library imports

# 3rd party imports
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector

# My imports
from poetry_analysis.items import PoetryAnalysisItem

HTML_FILE_NAME = r'.+\.html'


class PoetryParser(object):
    """
    Provides a common parsing method for poems formatted this one specific way.
    """
    date_pattern = r'(\d{2} \w{3,9} \d{4})'

    def parse_poem(self, response):
        hxs = HtmlXPathSelector(response)
        item = PoetryAnalysisItem()
        # All poetry text is in pre tags
        text = hxs.select('//pre/text()').extract()
        item['text'] = ''.join(text)
        item['url'] = response.url
        # head/title contains title - a poem by author
        title_text = hxs.select('//head/title/text()').extract()[0]
        item['title'], item['author'] = title_text.split(' - ')
        item['author'] = item['author'].replace('a poem by', '')
        for key in ['title', 'author']:
            item[key] = item[key].strip()
        item['date'] = hxs.select("//p[@class='small']/text()").re(self.date_pattern)
        return item


class PoetrySpider(CrawlSpider, PoetryParser):
    name = 'example.com_poetry'
    allowed_domains = ['www.example.com']
    root_path = 'someuser/poetry/'
    start_urls = ['http://www.example.com/someuser/poetry/recent/',
                  'http://www.example.com/someuser/poetry/less_recent/']

    rules = [Rule(SgmlLinkExtractor(allow=[start_urls[0] + HTML_FILE_NAME]),
                  callback='parse_poem'),
             Rule(SgmlLinkExtractor(allow=[start_urls[1] + HTML_FILE_NAME]),
                  callback='parse_poem')]