Skip to content

Instantly share code, notes, and snippets.

@acdha
Created December 28, 2010 20:46
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save acdha/757688 to your computer and use it in GitHub Desktop.
Save acdha/757688 to your computer and use it in GitHub Desktop.
django-haystack "hackend" which stuffs values in through the Solr extraction handler for rich content indexing
# encoding: utf-8
import logging
from poster.encode import multipart_encode
from pysolr import SolrError, safe_urlencode
from haystack.backends.solr_backend import *
# For sanity:
from haystack.backends.solr_backend import SearchBackend as StandardSolrBackend
class ExtractionSearchBackend(StandardSolrBackend):
    """Solr backend that can index objects through the extraction handler.

    An index opts in by exposing a truthy ``use_extraction_handler``
    attribute. Every other index is passed straight through to the standard
    Solr backend's ``update``.
    """

    def update(self, index, iterable, commit=True):
        """Index each object in ``iterable``, one extraction request apiece.

        :param index: the search index describing how objects are prepared;
            also consulted for the ``use_extraction_handler`` opt-in flag.
        :param iterable: the objects to index.
        :param commit: when true, ``self.conn.commit()`` is called once after
            all objects have been processed.

        ``UnicodeDecodeError`` and ``IOError`` raised for a single object are
        logged and that object is skipped. Any other exception (including
        ``SolrError`` re-raised by :meth:`extract`) propagates to the caller.
        """
        # opt-in to non-standard handling:
        if not getattr(index, "use_extraction_handler", False):
            # Name this class explicitly instead of the module-level
            # ``SearchBackend`` alias, so the super() lookup does not depend
            # on whatever that global happens to be bound to.
            return super(ExtractionSearchBackend, self).update(
                index, iterable, commit=commit)

        # Unlike the default backend, we want to prepare the document like
        # normal but must then post it to a completely different handler with
        # a completely different call signature - see
        # http://wiki.apache.org/solr/ExtractingRequestHandler for details.
        #
        # In practice this means posting all of the non-text fields using
        # URL parameters and passing the file contents as multipart data
        for obj in iterable:
            try:
                self.extract(obj, index=index)
            except (UnicodeDecodeError, IOError) as e:
                logging.error("Unable to update index for %s: %s", obj, e,
                              exc_info=True)

        if commit:
            self.conn.commit()

    def extract(self, obj, index=None):
        """Post a single object to Solr's ``/update/extract`` handler.

        The prepared fields of ``obj`` (minus ``text``) are sent as
        ``literal.<field>`` parameters alongside the file contents in one
        multipart POST.

        :param obj: the object to index.
        :param index: the search index used to prepare ``obj``. Despite the
            ``None`` default it is required: it is dereferenced
            unconditionally.

        Returns ``None``. A ``UnicodeDecodeError`` while preparing the
        metadata is logged and swallowed; ``IOError`` / ``SolrError`` from the
        request are logged and re-raised.
        """
        params = {
            "boost": index.get_field_weights(),
        }

        try:
            metadata = index.full_prepare(obj)
        except UnicodeDecodeError as e:
            logging.error("Error preparing metadata for %r: %s", obj, e,
                          exc_info=True)
            return

        # We don't actually want to use the prepared text field here:
        metadata.pop("text", None)

        for k, v in metadata.items():
            params["literal.%s" % k] = v

        # FIXME: Actually open source URL rather than reading test data:
        with open("qt5nz25228.pdf", "rb") as source_file:
            params["source_file"] = source_file
            body_generator, headers = multipart_encode(params)
            # Build the whole body before the file is closed; the generator
            # may read from the file lazily.
            body = "".join(body_generator)

        try:
            self.conn._send_request('POST',
                                    "%s/update/extract" % self.conn.path,
                                    body, headers)
        except (IOError, SolrError) as e:
            self.log.error("Failed to add documents to Solr: %s", e,
                           exc_info=True)
            # Bare raise keeps the original traceback intact.
            raise
# Rebind the module-level name ``SearchBackend`` to the extraction-aware
# subclass (the stock class stays reachable as ``StandardSolrBackend``).
# NOTE(review): presumably haystack looks up ``SearchBackend`` in the
# configured backend module, which is why the alias is needed - confirm.
SearchBackend = ExtractionSearchBackend
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment