Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
Text extraction processor for Paperclip to read PDF files
# Reads the text file queued for write under the :text style of the +pdf+
# attachment and stores an ASCII-only copy of its contents in +plain_text+.
#
# Every line is treated as UTF-8; characters that have no ASCII equivalent,
# as well as invalid byte sequences, are dropped rather than replaced.
#
# String#encode is used instead of Iconv because Iconv is no longer part of
# the Ruby standard library, and File.foreach guarantees the file handle is
# closed once reading finishes (the handle was previously left open).
def extract_text
  text_path = pdf.queued_for_write[:text].path
  ascii_text = ''.dup

  File.foreach(text_path) do |line|
    # Explicit source encoding: interpret the bytes as UTF-8 regardless of
    # the process-wide default external encoding.
    ascii_text << line.encode('ASCII', 'UTF-8', invalid: :replace, undef: :replace, replace: '')
  end

  self.plain_text = ascii_text
end
module Paperclip
  # Handles extracting plain text from PDF file attachments by shelling out
  # to the +pdftotext+ binary at /usr/bin/pdftotext.
  #
  # NOTE(review): @basename and @whiny are read by #make but are not assigned
  # anywhere in this class -- presumably they are set by Processor or by an
  # initializer not shown here; confirm, otherwise the temp file is named
  # just "txt" and failures are never raised.
  class Text < Processor
    attr_accessor :whiny

    # Creates a Text extract from PDF.
    #
    # Converts @file into a new Tempfile using +pdftotext -nopgbrk+ and
    # returns that Tempfile. If the command raises
    # PaperclipCommandLineError, a PaperclipError is raised only when @whiny
    # is truthy; otherwise the error is swallowed and the (possibly empty)
    # Tempfile is still returned.
    def make
      src = @file
      dst = Tempfile.new([@basename, 'txt'].compact.join('.'))
      src_path = File.expand_path(src.path)
      dst_path = File.expand_path(dst.path)

      begin
        # Build the argument string directly. Collapsing whitespace over the
        # whole string (as was done before) would also rewrite runs of
        # whitespace inside the quoted paths and point pdftotext at the
        # wrong file.
        # NOTE(review): the paths are only double-quoted, so shell
        # metacharacters inside them are not escaped -- confirm how
        # Paperclip.run treats its parameter string.
        Paperclip.run('/usr/bin/pdftotext -nopgbrk', %("#{src_path}" "#{dst_path}"))
        Rails.logger.info "Processing #{src.path} to #{dst.path} in the text processor."
      rescue PaperclipCommandLineError
        raise PaperclipError, "There was an error processing the text for #{@basename}" if @whiny
      end

      dst
    end
  end
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
You can’t perform that action at this time.