A toy command line utility for OCRing and cleaning OCR output.
#!/usr/bin/env ruby | |
# tesse: command-line tool for reviewing tesseract OCR output and cleaning it up
# Besides the gems required below, it needs the following Linux programs:
# eog: for viewing the images
# wmctrl: for resizing and positioning the image viewing window
require 'tesseract' | |
require 'ffi/aspell' | |
require 'tempfile' | |
require 'formatador' | |
# Path of the image to OCR, taken from the first command-line argument.
# Checked up front: without this a missing argument surfaces as an opaque
# TypeError from File.expand_path(nil), and a wrong path only fails later
# inside the OCR engine.
abort "Usage: #{File.basename($PROGRAM_NAME)} IMAGE_FILE" if ARGV.empty?
file = File.expand_path(ARGV[0])
abort "tesse: no such file: #{file}" unless File.file?(file)
system('clear')

# Tesseract engine for English text. Recognition is limited to ASCII letters,
# digits, space and period; the pipe character is explicitly blacklisted.
engine = Tesseract::Engine.new do |e|
  e.language = :eng
  e.blacklist = '|'
  e.whitelist = [*'a'..'z', *'A'..'Z', *0..9, ' .'].join
end

# Single aspell handle (en_US dictionary) used by the spell-check scoring.
Speller = FFI::Aspell::Speller.new('en_US')
# Percentage of the whitespace-separated words in +text+ that the Speller
# dictionary accepts.
#
# @param text [String, nil] one line of OCR output
# @return [Float] value between 0.0 and 100.0; 0.0 when +text+ contains no
#   words (previously this divided 0.0 by 0 and returned NaN)
def spelling_percentage(text)
  words = text.to_s.split
  return 0.0 if words.empty?

  # Same order of float operations as before (divide, then scale) so results
  # for non-empty input are bit-for-bit unchanged.
  words.count { |word| Speller.correct?(word) }.to_f / words.length * 100.0
end
# Interactive review of every OCR'd line. For each line the raw text, the
# engine confidence and the spelling score are printed and the line's image is
# opened in eog. The answer to "Continue?" decides what is kept:
#   e             replace the OCR text with a hand-typed transcript
#   d             drop the line
#   anything else keep the OCR text unchanged
cleaned_ocr = []
engine.each_line_for(file) do |line|
  text = line.text
  puts text
  Formatador.display "[green]\n-----------------------------------------------\n[/]"
  Formatador.display_table([{ confidence: line.confidence, spelling: spelling_percentage(text) }])

  # Write the line image to a temp file. A separate local is used so the outer
  # `file` (the input path) is not overwritten from inside the block.
  image_file = Tempfile.new('tesse_image')
  eog_pid = nil
  begin
    image_file.binmode
    image_file.write line.image.to_blob
    # Flush before launching the viewer; otherwise the data may still sit in
    # Ruby's write buffer and eog would open an empty or truncated file.
    image_file.flush

    # Show the image in eog and position the windows nicely. Spawning eog
    # directly (no shell) means eog_pid is the viewer's own pid.
    eog_pid = spawn('eog', image_file.path, err: File::NULL)
    system "sleep 1; wmctrl -a tesse; wmctrl -r tesse_image -e 0,0,700,1500,300;"

    print "Continue? "
    # gets returns nil at EOF; to_s turns that into "keep the line as is".
    case STDIN.gets.to_s.chomp
    when 'e'
      puts "Type out a better transcript of this line:"
      text = STDIN.gets.to_s.chomp
    when 'd'
      text = nil
    end
    cleaned_ocr << text
  ensure
    # Clean up the viewer and the temp file even if something above raised.
    if eog_pid
      begin
        Process.kill 'TERM', eog_pid
      rescue Errno::ESRCH
        # viewer process is already gone
      end
      # Reap the child in the background so it does not linger as a zombie.
      Process.detach(eog_pid)
    end
    # Carried over from the original script: closes every running eog,
    # presumably to catch a viewer window not owned by eog_pid.
    # NOTE(review): this also closes unrelated eog windows.
    system('killall', 'eog', out: File::NULL, err: File::NULL)
    image_file.close!
  end
  system('clear')
end

puts 'Done! Following is the cleaned output'
Formatador.display "[green]\n-----------------------------------------------\n[/]"
# Dropped lines were recorded as nil; remove them before printing.
cleaned_ocr.compact!
puts cleaned_ocr
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment