# acdha/solr_extraction_backend.py

## solr_extraction_backend.py
# encoding: utf-8
import logging

from poster.encode import multipart_encode

from pysolr import SolrError, safe_urlencode

from haystack.backends.solr_backend import *
# For sanity:
from haystack.backends.solr_backend import SearchBackend as StandardSolrBackend


class ExtractionSearchBackend(StandardSolrBackend):
    """Solr backend that can index objects through ``/update/extract``.

    Indexes opt in by setting a truthy ``use_extraction_handler`` attribute;
    every other index is passed straight to the standard Solr backend.
    """

    def update(self, index, iterable, commit=True):
        """Add the objects in ``iterable`` to Solr.

        :param index: the search index describing the objects; handled by
            the extraction code path only when it has a truthy
            ``use_extraction_handler`` attribute.
        :param iterable: the objects to index.
        :param commit: when true, ``self.conn.commit()`` is called once after
            all objects have been processed.
        :returns: the parent backend's result for non-opted-in indexes,
            otherwise ``None``.

        ``UnicodeDecodeError`` and ``IOError`` raised while extracting a
        single object are logged and the object is skipped; any other
        exception (including ``SolrError`` re-raised by :meth:`extract`)
        propagates to the caller.
        """
        # opt-in to non-standard handling:
        if not getattr(index, "use_extraction_handler", False):
            # Name this class explicitly instead of the module-level
            # ``SearchBackend`` alias so the call does not depend on that
            # name having been rebound.
            return super(ExtractionSearchBackend, self).update(
                index, iterable, commit=commit)

        # Unlike the default backend, we want to prepare the document like
        # normal but must then post it to a completely different handler with
        # a completely different call signature - see
        # http://wiki.apache.org/solr/ExtractingRequestHandler for details.
        #
        # In practice this means posting all of the non-text fields using
        # URL parameters and passing the file contents as multipart data

        for obj in iterable:
            try:
                self.extract(obj, index=index)
            except (UnicodeDecodeError, IOError) as e:
                # Best effort: one unreadable document must not abort the
                # rest of the batch.
                logging.error("Unable to update index for %s: %s", obj, e,
                              exc_info=e)

        if commit:
            self.conn.commit()

    def extract(self, obj, index=None):
        """Post a single object to the ``update/extract`` handler.

        The prepared index fields (minus ``text``) are sent as
        ``literal.<field>`` parameters alongside the file contents in one
        multipart POST.

        :param obj: the object to index.
        :param index: the search index used to prepare ``obj``. Despite the
            ``None`` default an index is required: it is dereferenced
            immediately.
        :returns: ``None``. Returns early, after logging, when preparing the
            metadata raises ``UnicodeDecodeError``.
        :raises IOError: if the source file cannot be opened or read, or the
            request fails.
        :raises SolrError: if the request fails; logged and re-raised.
        """
        params = {
            "boost": index.get_field_weights(),
        }

        try:
            metadata = index.full_prepare(obj)
        except UnicodeDecodeError as e:
            logging.error("Error preparing metadata for %r: %s", obj, e,
                          exc_info=e)
            return

        # We don't actually want to use the prepared text field here:
        metadata.pop("text", None)

        for k, v in metadata.items():
            params["literal.%s" % k] = v

        # FIXME: Actually open source URL rather than reading test data:
        with open("qt5nz25228.pdf", "rb") as source_file:
            params["source_file"] = source_file
            body_generator, headers = multipart_encode(params)
            # The generator reads from the file lazily, so the body has to be
            # fully built before the file is closed.
            body = "".join(body_generator)

        try:
            # NOTE(review): relies on pysolr's private ``_send_request``;
            # verify the signature when upgrading pysolr.
            self.conn._send_request('POST',
                                    "%s/update/extract" % self.conn.path,
                                    body, headers)
        except (IOError, SolrError) as e:
            self.log.error("Failed to add documents to Solr: %s", e, exc_info=e)
            # Bare ``raise`` keeps the original traceback intact.
            raise

# Rebind the name ``SearchBackend`` (originally star-imported from
# haystack.backends.solr_backend) to the extraction-aware subclass, so this
# module exposes ExtractionSearchBackend under that name.
# NOTE(review): presumably Haystack loads a backend module's ``SearchBackend``
# attribute - confirm against the Haystack version in use.
SearchBackend = ExtractionSearchBackend
	# encoding: utf-8
	import logging

	from poster.encode import multipart_encode

	from pysolr import SolrError, safe_urlencode

	from haystack.backends.solr_backend import *
	# For sanity:
	from haystack.backends.solr_backend import SearchBackend as StandardSolrBackend


	class ExtractionSearchBackend(StandardSolrBackend):
	    """Solr backend that can index objects through ``/update/extract``.

	    Indexes opt in by setting a truthy ``use_extraction_handler`` attribute;
	    every other index is passed straight to the standard Solr backend.
	    """

	    def update(self, index, iterable, commit=True):
	        """Add the objects in ``iterable`` to Solr.

	        :param index: the search index describing the objects; handled by
	            the extraction code path only when it has a truthy
	            ``use_extraction_handler`` attribute.
	        :param iterable: the objects to index.
	        :param commit: when true, ``self.conn.commit()`` is called once after
	            all objects have been processed.
	        :returns: the parent backend's result for non-opted-in indexes,
	            otherwise ``None``.

	        ``UnicodeDecodeError`` and ``IOError`` raised while extracting a
	        single object are logged and the object is skipped; any other
	        exception (including ``SolrError`` re-raised by :meth:`extract`)
	        propagates to the caller.
	        """
	        # opt-in to non-standard handling:
	        if not getattr(index, "use_extraction_handler", False):
	            # Name this class explicitly instead of the module-level
	            # ``SearchBackend`` alias so the call does not depend on that
	            # name having been rebound.
	            return super(ExtractionSearchBackend, self).update(
	                index, iterable, commit=commit)

	        # Unlike the default backend, we want to prepare the document like
	        # normal but must then post it to a completely different handler with
	        # a completely different call signature - see
	        # http://wiki.apache.org/solr/ExtractingRequestHandler for details.
	        #
	        # In practice this means posting all of the non-text fields using
	        # URL parameters and passing the file contents as multipart data

	        for obj in iterable:
	            try:
	                self.extract(obj, index=index)
	            except (UnicodeDecodeError, IOError) as e:
	                # Best effort: one unreadable document must not abort the
	                # rest of the batch.
	                logging.error("Unable to update index for %s: %s", obj, e,
	                              exc_info=e)

	        if commit:
	            self.conn.commit()

	    def extract(self, obj, index=None):
	        """Post a single object to the ``update/extract`` handler.

	        The prepared index fields (minus ``text``) are sent as
	        ``literal.<field>`` parameters alongside the file contents in one
	        multipart POST.

	        :param obj: the object to index.
	        :param index: the search index used to prepare ``obj``. Despite the
	            ``None`` default an index is required: it is dereferenced
	            immediately.
	        :returns: ``None``. Returns early, after logging, when preparing the
	            metadata raises ``UnicodeDecodeError``.
	        :raises IOError: if the source file cannot be opened or read, or the
	            request fails.
	        :raises SolrError: if the request fails; logged and re-raised.
	        """
	        params = {
	            "boost": index.get_field_weights(),
	        }

	        try:
	            metadata = index.full_prepare(obj)
	        except UnicodeDecodeError as e:
	            logging.error("Error preparing metadata for %r: %s", obj, e,
	                          exc_info=e)
	            return

	        # We don't actually want to use the prepared text field here:
	        metadata.pop("text", None)

	        for k, v in metadata.items():
	            params["literal.%s" % k] = v

	        # FIXME: Actually open source URL rather than reading test data:
	        with open("qt5nz25228.pdf", "rb") as source_file:
	            params["source_file"] = source_file
	            body_generator, headers = multipart_encode(params)
	            # The generator reads from the file lazily, so the body has to be
	            # fully built before the file is closed.
	            body = "".join(body_generator)

	        try:
	            # NOTE(review): relies on pysolr's private ``_send_request``;
	            # verify the signature when upgrading pysolr.
	            self.conn._send_request('POST',
	                                    "%s/update/extract" % self.conn.path,
	                                    body, headers)
	        except (IOError, SolrError) as e:
	            self.log.error("Failed to add documents to Solr: %s", e, exc_info=e)
	            # Bare ``raise`` keeps the original traceback intact.
	            raise

	# Rebind the name ``SearchBackend`` (originally star-imported from
	# haystack.backends.solr_backend) to the extraction-aware subclass, so this
	# module exposes ExtractionSearchBackend under that name.
	# NOTE(review): presumably Haystack loads a backend module's ``SearchBackend``
	# attribute - confirm against the Haystack version in use.
	SearchBackend = ExtractionSearchBackend