# torreypayne/ruby_ocr.rb

## ruby_ocr.rb
require 'tempfile'

require 'parallel'
require 'rtesseract'
require 'mini_magick'

# Path of the PDF to run OCR on.
source = "/MyDirectory/my.pdf"
pdf = MiniMagick::Image.open(source)

# OCR every page concurrently (up to 8 threads). Parallel.map returns its
# results in input order, so each block just returns the page's text instead
# of writing into a hash shared between threads.
page_texts = Parallel.map(pdf.pages, in_threads: 8) do |page|
  # Scratch TIFF that receives the rendered page and is then read by Tesseract.
  tmpfile = Tempfile.new(['', '.tif'])
  begin
    MiniMagick::Tool::Convert.new do |convert|
      convert.density(300) # density option is given ahead of the input path
      convert << page.path
      convert.alpha("off")
      convert << tmpfile.path
    end
    # to_s must run while the scratch file still exists.
    RTesseract.new(tmpfile.path).to_s
  ensure
    # Close and delete the scratch file even when conversion or OCR raised.
    tmpfile.close!
  end
end

# Page index => recognized text, built on the main thread only.
doc = page_texts.each_with_index.map { |text, idx| [idx, text] }.to_h

# Recognized text of all pages, in page order.
doc.values
	require 'parallel'
	require 'rtesseract'
	require 'mini_magick'

	# NOTE(review): this section repeats the OCR script found earlier in this
	# file; its block-parameter pipes had been backslash-escaped, which is not
	# valid Ruby, and are restored here.

	# Path of the PDF to run OCR on.
	source = "/MyDirectory/my.pdf"
	pdf = MiniMagick::Image.open(source)

	# OCR every page concurrently (up to 8 threads). Parallel.map returns its
	# results in input order, so each block just returns the page's text instead
	# of writing into a hash shared between threads.
	page_texts = Parallel.map(pdf.pages, in_threads: 8) do |page|
	  # Scratch TIFF that receives the rendered page and is then read by Tesseract.
	  tmpfile = Tempfile.new(['', '.tif'])
	  begin
	    MiniMagick::Tool::Convert.new do |convert|
	      convert.density(300) # density option is given ahead of the input path
	      convert << page.path
	      convert.alpha("off")
	      convert << tmpfile.path
	    end
	    # to_s must run while the scratch file still exists.
	    RTesseract.new(tmpfile.path).to_s
	  ensure
	    # Close and delete the scratch file even when conversion or OCR raised.
	    tmpfile.close!
	  end
	end

	# Page index => recognized text, built on the main thread only.
	doc = page_texts.each_with_index.map { |text, idx| [idx, text] }.to_h

	# Recognized text of all pages, in page order.
	doc.values