Skip to content

Instantly share code, notes, and snippets.

@benzkji
Created December 8, 2015 07:15
Show Gist options
  • Save benzkji/e6f6ab28e65560856bfb to your computer and use it in GitHub Desktop.
Save benzkji/e6f6ab28e65560856bfb to your computer and use it in GitHub Desktop.
haystack index for django-filer PDFs
# coding: utf-8
from __future__ import unicode_literals
import os
from pdfminer.pdfdocument import PDFEncryptionError
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
from cStringIO import StringIO
from haystack import indexes
from filer.models import File
class PDFIndex(indexes.SearchIndex, indexes.Indexable):
    """Search index over filer ``File`` objects whose file name ends in ``.pdf``.

    The indexed document text is the file name, the file URL and the text
    extracted from the PDF, joined by single spaces.
    """

    text = indexes.CharField(document=True)
    title = indexes.CharField()
    url = indexes.CharField()
    # Class-level starting value; ``prepare_text`` increments it on the
    # instance and only uses it for the progress output.
    counter = 0

    def get_model(self):
        """Return the model class this index covers (filer's ``File``)."""
        return File

    def index_queryset(self, using=None):
        """Used when the entire index for model is updated."""
        manager = self.get_model().objects
        return manager.filter(file__endswith='.pdf')

    def get_updated_field(self):
        """Return ``"modified_at"`` as the name of the updated-field."""
        return "modified_at"

    def prepare_text(self, object):
        """Build the document text for ``object`` and print progress output.

        Prints a numbered header, the object and its URL, extracts the PDF
        text from ``object.path`` and returns "<title> <url> <pdf text>".
        """
        self.counter += 1
        print("--- %s ---------------------" % self.counter)
        print(object)
        print(object.url)
        pdf_text = convert_pdf_to_txt(object.path)
        parts = (self.prepare_title(object), self.prepare_url(object), pdf_text)
        # Format before printing the "ok" marker so a formatting failure is
        # not reported as success.
        document = "%s %s %s" % parts
        print("--- ok -----")
        return document

    def prepare_title(self, object):
        """Return the name of the underlying stored file as the title."""
        return object.file.name

    def prepare_url(self, object):
        """Return the URL of ``object``."""
        return object.url
def convert_pdf_to_txt(path):
    """Extract the text of the PDF file at ``path``.

    Every page of the document is run through a pdfminer ``TextConverter``
    that writes UTF-8 encoded text into an in-memory buffer; the buffer is
    decoded and returned once all pages were processed.

    :param path: filesystem path of the PDF to read.
    :returns: the extracted text, or ``''`` when ``path`` is not an existing
        file or when page processing raises ``UnicodeDecodeError`` or
        ``PDFEncryptionError`` (both are reported on stdout as encryption
        problems).
    """
    if not os.path.isfile(path):
        print("file not existing: %s" % path)
        return ''

    codec = 'utf-8'
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    device = TextConverter(rsrcmgr, retstr, codec=codec, laparams=LAParams())
    try:
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        # The context manager closes the PDF on every exit path, including
        # the early returns in the handlers below.
        with open(path, 'rb') as fp:
            # Empty page set and maxpages=0 are passed as before, i.e. no
            # explicit page selection or page limit is requested.
            pages = PDFPage.get_pages(fp, set(), maxpages=0, password="",
                                      caching=True, check_extractable=True)
            for page in pages:
                interpreter.process_page(page)
    except UnicodeDecodeError:
        print("ENCRYPTED PDF DETECTED (probably, but no unknown encryption)!")
        return ''
    except PDFEncryptionError:
        print("UNKNOWN ENCRYPTION DETECTED")
        return ''
    else:
        # Decode outside the guarded body so a decoding failure of the final
        # buffer still propagates instead of being reported as "encrypted".
        return retstr.getvalue().decode(codec)
    finally:
        # Runs after the value above was read; releases the converter and
        # the buffer on success and on every error path.
        device.close()
        retstr.close()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment