Skip to content

Instantly share code, notes, and snippets.

@benzkji
Last active June 20, 2019 08:02
Show Gist options
  • Save benzkji/6726fb52cad9824809687fa2a824a3c5 to your computer and use it in GitHub Desktop.
Save benzkji/6726fb52cad9824809687fa2a824a3c5 to your computer and use it in GitHub Desktop.
filer haystack pdf integration
# full of prints, forgive me ;-)
# coding: utf-8
from __future__ import unicode_literals
import os
from pdfminer.pdfdocument import PDFEncryptionError
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
from StringIO import StringIO
from haystack import indexes
from filer.models import File
class PDFIndex(indexes.SearchIndex, indexes.Indexable):
    """Haystack search index over filer ``File`` objects that are PDFs.

    The document field ``text`` is built from the file's original filename,
    its URL and the text extracted from the PDF by ``convert_pdf_to_txt``.
    """

    text = indexes.CharField(document=True)
    title = indexes.CharField()
    type = indexes.CharField()
    url = indexes.CharField()
    # Number of files handled by prepare_text so far; only used for the
    # progress output printed while indexing.
    counter = 0

    def get_model(self):
        """Return the model class covered by this index."""
        return File

    def index_queryset(self, using=None):
        """Used when the entire index for model is updated."""
        # Case-insensitive suffix match, so files stored as ``*.PDF`` are
        # indexed as well as ``*.pdf``.
        return self.get_model().objects.filter(file__iendswith='.pdf')

    def get_updated_field(self):
        """Return the name of the model field reported as the update marker."""
        return "modified_at"

    def prepare_text(self, object):
        """Return the searchable document: title, URL and extracted PDF text.

        Prints a small progress banner per file as a side effect.
        """
        self.counter += 1
        # Single-argument print(...) behaves the same as the Python 2 print
        # statement and also parses on Python 3.
        print("-- %s ---------------------" % self.counter)
        print(object)
        print(object.url)
        text = convert_pdf_to_txt(object.path)
        document = "%s %s %s" % (
            self.prepare_title(object),
            self.prepare_url(object),
            text,
        )
        print("---")
        return document

    def prepare_title(self, object):
        """Return the file's original filename as the title."""
        return object.original_filename

    def prepare_type(self, object):
        """Return the constant type label for every indexed file."""
        return 'PDF'

    def prepare_url(self, object):
        """Return the file's URL."""
        return object.url
def convert_pdf_to_txt(path):
    """Extract the text of the PDF at ``path`` using pdfminer.

    Args:
        path: Filesystem path of the PDF file.

    Returns:
        The extracted text as a unicode string. An empty string is returned
        when ``path`` is not an existing file, or when pdfminer raises
        ``UnicodeDecodeError`` or ``PDFEncryptionError`` while reading the
        pages (treated here as an encrypted / unreadable document).

    The file handle, the converter device and the output buffer are always
    closed, including on the early-return and error paths.
    """
    if not os.path.isfile(path):
        print("file not existing: %s" % path)
        return ''

    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    device = TextConverter(rsrcmgr, retstr, codec='utf-8', laparams=LAParams())
    try:
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        with open(path, 'rb') as fp:
            try:
                # Empty page set and maxpages=0 are passed to request all
                # pages with no page filter, as in the original call.
                pages = PDFPage.get_pages(
                    fp,
                    set(),
                    maxpages=0,
                    password="",
                    caching=True,
                    check_extractable=True,
                )
                for page in pages:
                    interpreter.process_page(page)
            except UnicodeDecodeError:
                print("ENCRYPTED PDF DETECTED (probably)!")
                return ''
            except PDFEncryptionError:
                print("UNKNOWN ENCRYPTION DETECTED")
                return ''
        # Decode outside the except clauses above so that a failure to decode
        # the collected output still propagates to the caller.
        return unicode(retstr.getvalue(), 'utf-8')
    finally:
        device.close()
        retstr.close()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment