Skip to content

Instantly share code, notes, and snippets.

@peterfarrell
Forked from benzkji/search_indexes.py
Created January 9, 2019 17:15
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save peterfarrell/2b511718c95f5806e3f2a8f1cd241fcd to your computer and use it in GitHub Desktop.
Save peterfarrell/2b511718c95f5806e3f2a8f1cd241fcd to your computer and use it in GitHub Desktop.
haystack index for django-filer PDFs
# coding: utf-8
from __future__ import unicode_literals
import os
from pdfminer.pdfdocument import PDFEncryptionError
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
from cStringIO import StringIO
from haystack import indexes
from filer.models import File
class PDFIndex(indexes.SearchIndex, indexes.Indexable):
    """Haystack search index over django-filer ``File`` objects that are PDFs.

    The document field ``text`` is built from the file name, the file URL
    and the text extracted from the PDF by ``convert_pdf_to_txt``.
    """

    text = indexes.CharField(document=True)
    title = indexes.CharField()
    url = indexes.CharField()
    # Number of objects prepared so far; only used for the progress output
    # printed by ``prepare_text``.
    counter = 0

    def get_model(self):
        """Return the model class covered by this index."""
        return File

    def index_queryset(self, using=None):
        """Used when the entire index for model is updated."""
        # Case-insensitive suffix match: a case-sensitive ``endswith`` skips
        # files such as "REPORT.PDF" on backends whose LIKE is case-sensitive.
        return self.get_model().objects.filter(file__iendswith='.pdf')

    def get_updated_field(self):
        """Return the name of the model field holding the modification date."""
        return "modified_at"

    def prepare_text(self, object):
        """Build the document text for ``object``.

        Prints progress information, extracts the PDF text from
        ``object.path`` and returns "<title> <url> <extracted text>".
        """
        self.counter += 1
        print("--- %s ---------------------" % self.counter)
        print(object)
        print(object.url)
        text = convert_pdf_to_txt(object.path)
        document = "%s %s %s" % (
            self.prepare_title(object),
            self.prepare_url(object),
            text,
        )
        print("--- ok -----")
        return document

    def prepare_title(self, object):
        """Return the name of the underlying file as the title."""
        return object.file.name

    def prepare_url(self, object):
        """Return the URL of the file object."""
        return object.url
def convert_pdf_to_txt(path):
    """Extract the text of the PDF at ``path`` using pdfminer.

    Returns the extracted text decoded from UTF-8. Returns an empty string
    (after printing a short message) when ``path`` is not an existing file,
    or when processing the pages raises ``UnicodeDecodeError`` or
    ``PDFEncryptionError``.

    The input file, the converter device and the output buffer are closed
    on every exit path, including the early returns and unexpected errors.
    """
    if not os.path.isfile(path):
        print("file not existing: %s" % path)
        return ''

    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    device = TextConverter(rsrcmgr, retstr, codec='utf-8', laparams=LAParams())
    try:
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        with open(path, 'rb') as fp:
            try:
                # Empty page set and maxpages=0 are passed through unchanged
                # from the previous implementation; no password is supplied.
                pages = PDFPage.get_pages(
                    fp,
                    set(),
                    maxpages=0,
                    password="",
                    caching=True,
                    check_extractable=True,
                )
                for page in pages:
                    interpreter.process_page(page)
            except UnicodeDecodeError:
                print("ENCRYPTED PDF DETECTED (probably, but no unknown encryption)!")
                return ''
            except PDFEncryptionError:
                print("UNKNOWN ENCRYPTION DETECTED")
                return ''
        # Read the buffer before the ``finally`` clause closes it.
        return retstr.getvalue().decode('utf-8')
    finally:
        device.close()
        retstr.close()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment