walterdavis/document.rb

## document.rb
  # Reads the file at pdf.queued_for_write[:text].path line by line, converts
  # each line from UTF-8 to ASCII and stores the concatenated result in
  # +plain_text+.
  #
  # The 'ASCII//IGNORE' target asks Iconv to drop characters that have no
  # ASCII representation rather than fail on them.
  #
  # NOTE(review): Iconv was removed from the standard library in Ruby 2.0; on
  # newer Rubies this needs the `iconv` gem -- confirm the target Ruby version.
  #
  # Returns the converted text that was assigned to +plain_text+.
  def extract_text
    ascii_text = ""
    # The block form of File.open closes the handle when the block exits,
    # including when a conversion raises.
    File.open(pdf.queued_for_write[:text].path, "r") do |file|
      file.each_line do |line|
        ascii_text << Iconv.conv('ASCII//IGNORE', 'UTF8', line)
      end
    end
    self.plain_text = ascii_text
  end

## text.rb
module Paperclip
  # Paperclip processor that extracts plain text from a PDF attachment by
  # shelling out to /usr/bin/pdftotext.
  class Text < Processor

    # When truthy, a failed extraction raises PaperclipError instead of being
    # swallowed (see #make).
    attr_accessor :whiny

    # Runs pdftotext over the source file and returns the Tempfile that the
    # extracted text is written to.
    #
    # Reads @file, @basename and @whiny. None of them is assigned in this
    # class, so they are presumably set up by Processor or by the caller --
    # TODO confirm.
    #
    # Returns the destination Tempfile. When the command fails and @whiny is
    # falsy, the error is swallowed and the Tempfile is still returned.
    # Raises PaperclipError when the command fails and @whiny is truthy.
    def make
      src = @file
      dst = Tempfile.new([@basename, 'txt'].compact.join("."))
      # Quote each path so embedded spaces survive the shell. The paths are
      # interpolated as-is: squeezing whitespace over the whole argument
      # string would also rewrite runs of whitespace inside the paths.
      arguments = %("#{File.expand_path(src.path)}" "#{File.expand_path(dst.path)}")

      begin
        Paperclip.run("/usr/bin/pdftotext -nopgbrk", arguments)
        Rails.logger.info "Processing #{src.path} to #{dst.path} in the text processor."
      rescue PaperclipCommandLineError
        raise PaperclipError, "There was an error processing the text for #{@basename}" if @whiny
      end
      dst
    end
  end
end
	# Reads the file at pdf.queued_for_write[:text].path line by line, converts
	# each line from UTF-8 to ASCII and stores the concatenated result in
	# +plain_text+.
	#
	# The 'ASCII//IGNORE' target asks Iconv to drop characters that have no
	# ASCII representation rather than fail on them.
	#
	# NOTE(review): Iconv was removed from the standard library in Ruby 2.0; on
	# newer Rubies this needs the `iconv` gem -- confirm the target Ruby version.
	#
	# Returns the converted text that was assigned to +plain_text+.
	def extract_text
	  ascii_text = ""
	  # The block form of File.open closes the handle when the block exits,
	  # including when a conversion raises.
	  File.open(pdf.queued_for_write[:text].path, "r") do |file|
	    file.each_line do |line|
	      ascii_text << Iconv.conv('ASCII//IGNORE', 'UTF8', line)
	    end
	  end
	  self.plain_text = ascii_text
	end
	module Paperclip
	  # Paperclip processor that extracts plain text from a PDF attachment by
	  # shelling out to /usr/bin/pdftotext.
	  class Text < Processor

	    # When truthy, a failed extraction raises PaperclipError instead of being
	    # swallowed (see #make).
	    attr_accessor :whiny

	    # Runs pdftotext over the source file and returns the Tempfile that the
	    # extracted text is written to.
	    #
	    # Reads @file, @basename and @whiny. None of them is assigned in this
	    # class, so they are presumably set up by Processor or by the caller --
	    # TODO confirm.
	    #
	    # Returns the destination Tempfile. When the command fails and @whiny is
	    # falsy, the error is swallowed and the Tempfile is still returned.
	    # Raises PaperclipError when the command fails and @whiny is truthy.
	    def make
	      src = @file
	      dst = Tempfile.new([@basename, 'txt'].compact.join("."))
	      # Quote each path so embedded spaces survive the shell. The paths are
	      # interpolated as-is: squeezing whitespace over the whole argument
	      # string would also rewrite runs of whitespace inside the paths.
	      arguments = %("#{File.expand_path(src.path)}" "#{File.expand_path(dst.path)}")

	      begin
	        Paperclip.run("/usr/bin/pdftotext -nopgbrk", arguments)
	        Rails.logger.info "Processing #{src.path} to #{dst.path} in the text processor."
	      rescue PaperclipCommandLineError
	        raise PaperclipError, "There was an error processing the text for #{@basename}" if @whiny
	      end
	      dst
	    end
	  end
	end