Skip to content

Instantly share code, notes, and snippets.

@jatkins
Created September 12, 2014 20:54
Show Gist options
  • Save jatkins/c2189849cca74cb737d3 to your computer and use it in GitHub Desktop.
Save jatkins/c2189849cca74cb737d3 to your computer and use it in GitHub Desktop.
OCR
#!/usr/local/bin/ruby
require 'fileutils'
# Root working directory; every directory directly beneath it is searched for
# *.tif files, and the master UZN file is read from its parent directory.
PATH = "/mnt/data/working/old_format".freeze
# Stroke colours, used in order for the zone rectangles drawn onto each image.
# Frozen so the shared palette cannot be mutated by accident.
COLORS = %w[green yellow red blue turquoise1 DarkBlue DeepPink pink purple crimson].freeze
# Builds one ImageMagick "-stroke ... -draw rectangle" option group per zone
# listed in the master UZN file and prints them, one group per line, each line
# ending in a shell continuation backslash.
#
# Every non-blank line is read as whitespace-separated "x y width height ...";
# the rectangle runs from (x, y) to (x + width, y + height).
#
# File.readlines closes the file handle; blank lines carry no zone and are skipped.
zone_fields = File.readlines("#{PATH}/../master_old.uzn").map(&:split).reject(&:empty?)

zone_options = zone_fields.each_with_index.map do |(x, y, width, height), index|
  # Wrap around the palette so zones beyond COLORS.size still get a stroke colour.
  color = COLORS[index % COLORS.size]
  "-stroke #{color} -strokewidth 2 -draw \"rectangle #{x},#{y} #{x.to_i + width.to_i},#{y.to_i + height.to_i}\" \\\n"
end

boxes = zone_options.join
puts boxes
# For every directory under PATH, draws a fixed set of coloured zone rectangles
# onto each *.tif with ImageMagick's `convert`, writing "<dir>/<basename>.png",
# then moves all PNGs of that directory into "<dir>/png/".

# One "-stroke ... -draw" group per hard-coded zone; built once because it does
# not depend on the file being processed.
overlays = [
  '-stroke green -strokewidth 2 -draw "rectangle 1250,600 1600,670"',
  '-stroke yellow -strokewidth 2 -draw "rectangle 250,1270 600,1340"',
  '-stroke red -strokewidth 2 -draw "rectangle 525,1270 875,1340"',
  '-stroke blue -strokewidth 2 -draw "rectangle 800,1270 1350,1340"',
  '-stroke turquoise1 -strokewidth 2 -draw "rectangle 250,1470 600,1540"',
  '-stroke DarkBlue -strokewidth 2 -draw "rectangle 525,1470 875,1540"',
  '-stroke DeepPink -strokewidth 2 -draw "rectangle 800,1470 1350,1540"',
  '-stroke pink -strokewidth 2 -draw "rectangle 135,1090 915,1160"',
  '-stroke purple -strokewidth 2 -draw "rectangle 200,2440 1570,2570"',
  '-stroke crimson -strokewidth 2 -draw "rectangle 875,2555 1700,2695"'
].join(" ")
# Alternative rectangle set that the original script kept commented out:
# -stroke red -strokewidth 2 -draw "rectangle 865,500 1135,550"
# -stroke yellow -strokewidth 2 -draw "rectangle 880,560 1430,610"
# -stroke blue -strokewidth 2 -draw "rectangle 1330,560 1840,610"
# -stroke green -strokewidth 2 -draw "rectangle 1730,560 2240,610"

# Only real directories are processed; a stray regular file under PATH would
# otherwise make the mkdir below fail.
dirs = Dir["#{PATH}/*"].select { |entry| File.directory?(entry) }.sort
dirs.each do |dir|
  files = Dir["#{dir}/*.tif"].sort
  puts "Running in #{dir}"

  temp_png = "#{dir}/temp.png"
  files.each do |file|
    file_base = File.basename(file, File.extname(file))
    # File.exists? was removed in Ruby 3.2; File.exist? is the supported spelling.
    FileUtils.rm(temp_png) if File.exist?(temp_png)
    # NOTE: paths are interpolated into the shell command unescaped.
    `convert #{file} -fill none #{overlays} #{dir}/#{file_base}.png`
  end

  png_dir = "#{dir}/png"
  FileUtils.mkdir(png_dir) unless File.directory?(png_dir)
  FileUtils.mv Dir.glob("#{dir}/*.png"), "#{png_dir}/"
end
#!/opt/jruby/bin/jruby
require 'fileutils'
require 'peach'
# Root working directory; every directory directly beneath it is searched for
# *.tif files.
PATH = "/mnt/data/working/old_format".freeze
# Master zone file: one zone per line, whitespace-separated fields.
M_UZN = "/mnt/data/working/master_old.uzn".freeze
# Stroke colours, used in order for the zone rectangles drawn onto each image.
# Frozen so the shared palette cannot be mutated by accident.
COLORS = %w[green yellow red blue turquoise1 DarkBlue DeepPink pink purple crimson gold].freeze
# Crops every zone listed in the master UZN file out of +image+ and renders an
# overview PNG with each zone outlined.
#
# Each non-blank line of M_UZN is read as whitespace-separated fields
# "x y width height label". For every zone, ImageMagick's `convert` writes
# "<dir>/<basename>_<label>.png"; a final `convert` call draws all zone
# rectangles onto the image and writes "<dir>/<basename>.png", where <dir> and
# <basename> are taken from +image+.
#
# Paths are interpolated into the shell command unescaped, so +image+ must not
# contain spaces or shell metacharacters.
#
# @param image [String] path of the image to crop and annotate
# @return [String] standard output of the final `convert` invocation
def generate_image(image)
  dir = File.dirname(image)
  file_name = File.basename(image, File.extname(image))

  # File.readlines closes the file handle; blank lines carry no zone and are skipped.
  zones = File.readlines(M_UZN).map(&:split).reject(&:empty?)

  overlay = ["convert #{image} -fill none -strokewidth 2"]
  zones.each_with_index do |(x, y, width, height, label), index|
    `convert #{image} -crop #{width}x#{height}+#{x}+#{y} +repage #{dir}/#{file_name}_#{label}.png`

    # Wrap around the palette so zones beyond COLORS.size still get a stroke colour.
    color = COLORS[index % COLORS.size]
    overlay << "-stroke #{color} -draw \"rectangle #{x},#{y} #{x.to_i + width.to_i},#{y.to_i + height.to_i}\""
  end
  overlay << "#{dir}/#{file_name}.png"

  `#{overlay.join(' ')}`
end
# For every directory under PATH: writes a deskewed/trimmed "<name>_ocr.tif"
# copy of each *.tif, runs Tesseract on it, renders zone crops and an overview
# PNG via generate_image, and collects the resulting .txt and .png files under
# "<dir>/ocr/<name>/". The temporary TIFF and the copied .uzn files are removed.

# Only real directories are processed; a stray regular file under PATH would
# otherwise make the mkdir below fail.
dirs = Dir["#{PATH}/*"].select { |entry| File.directory?(entry) }.sort
dirs.each do |dir|
  files = Dir["#{dir}/*.tif"].sort
  puts "Running in #{dir} with #{files.count} file(s)"

  # File.exists? was removed in Ruby 3.2; File.directory? alone expresses the
  # original "exists and is a directory" check.
  ocr_root = "#{dir}/ocr"
  FileUtils.mkdir(ocr_root) unless File.directory?(ocr_root)

  files.each do |file|
    base_name = File.basename(file, File.extname(file))
    ocr_image_name = "#{base_name}_ocr"
    f_resting = "#{ocr_root}/#{base_name}"
    FileUtils.mkdir(f_resting) unless File.directory?(f_resting)

    # The zone file is copied next to the working image under the same base
    # name -- presumably so Tesseract picks it up; confirm against Tesseract docs.
    FileUtils.cp M_UZN, "#{dir}/#{ocr_image_name}.uzn"
    # NOTE: paths are interpolated into the shell commands unescaped.
    `convert #{file} -set filename:f "%t" -background black -fuzz 75% -deskew 50% -trim +repage #{dir}/#{ocr_image_name}.tif`
    #`textcleaner -e none -f 10 -u -o 5 #{file} #{file_base}_ocr.tif`
    `tesseract #{dir}/#{ocr_image_name}.tif #{dir}/#{ocr_image_name}_uzn -psm 4`
    generate_image("#{dir}/#{ocr_image_name}.tif")

    # Gather everything produced for this page, then drop the working TIFF.
    FileUtils.mv Dir.glob("#{dir}/*.txt"), f_resting
    FileUtils.mv Dir.glob("#{dir}/*.png"), f_resting
    FileUtils.rm "#{dir}/#{ocr_image_name}.tif"
  end

  FileUtils.rm Dir.glob("#{dir}/*.uzn")
end
#!/usr/local/bin/ruby
require 'fileutils'
#require 'peach'
# Root working directory; every directory directly beneath it is searched for
# *.tif files.
PATH = "/mnt/data/working/old_format".freeze
# Master zone file: one zone per line, whitespace-separated fields.
M_UZN = "/mnt/data/working/master_old.uzn".freeze
# Stroke colours, used in order for the zone rectangles drawn onto each image.
# Frozen so the shared palette cannot be mutated by accident.
COLORS = %w[green yellow red blue turquoise1 DarkBlue DeepPink pink purple crimson gold].freeze
# Crops every zone listed in the master UZN file out of +image+ and renders an
# overview PNG with each zone outlined.
#
# Each non-blank line of M_UZN is read as whitespace-separated fields
# "x y width height label prefix". For every zone, ImageMagick's `convert`
# writes "<dir>/<prefix>-<basename>_<label>.ocr.png"; a final `convert` call
# draws all zone rectangles onto the image and writes "<dir>/<basename>.png",
# where <dir> and <basename> are taken from +image+.
#
# Paths are interpolated into the shell command unescaped, so +image+ must not
# contain spaces or shell metacharacters.
#
# @param image [String] path of the image to crop and annotate
# @return [String] standard output of the final `convert` invocation
def generate_image(image)
  dir = File.dirname(image)
  file_name = File.basename(image, File.extname(image))

  # File.readlines closes the file handle; blank lines carry no zone and are skipped.
  zones = File.readlines(M_UZN).map(&:split).reject(&:empty?)

  overlay = ["convert #{image} -fill none -strokewidth 2"]
  zones.each_with_index do |(x, y, width, height, label, prefix), index|
    `convert #{image} -crop #{width}x#{height}+#{x}+#{y} +repage #{dir}/#{prefix}-#{file_name}_#{label}.ocr.png`

    # Wrap around the palette so zones beyond COLORS.size still get a stroke colour.
    color = COLORS[index % COLORS.size]
    overlay << "-stroke #{color} -draw \"rectangle #{x},#{y} #{x.to_i + width.to_i},#{y.to_i + height.to_i}\""
  end
  overlay << "#{dir}/#{file_name}.png"

  `#{overlay.join(' ')}`
end
# For every directory under PATH: writes a deskewed/trimmed "<name>_ocr.tif"
# copy of each *.tif, cuts it into per-zone "*.ocr.png" crops via
# generate_image, moves the .txt and .png files of the directory into
# "<dir>/ocr/<name>/", and then runs Tesseract on each zone crop individually.

# Only real directories are processed; a stray regular file under PATH would
# otherwise make the mkdir below fail.
dirs = Dir["#{PATH}/*"].select { |entry| File.directory?(entry) }.sort
dirs.each do |dir|
  files = Dir["#{dir}/*.tif"].sort
  puts "Running in #{dir} with #{files.count} file(s)"

  # File.exists? was removed in Ruby 3.2; File.directory? alone expresses the
  # original "exists and is a directory" check.
  ocr_root = "#{dir}/ocr"
  FileUtils.mkdir(ocr_root) unless File.directory?(ocr_root)

  files.each do |file|
    base_name = File.basename(file, File.extname(file))
    ocr_image_name = "#{base_name}_ocr"
    f_resting = "#{ocr_root}/#{base_name}"
    FileUtils.mkdir(f_resting) unless File.directory?(f_resting)

    #FileUtils.cp M_UZN, "#{dir}/#{ocr_image_name}.uzn"
    # NOTE: paths are interpolated into the shell commands unescaped.
    `convert #{file} -set filename:f "%t" -background black -fuzz 75% -deskew 50% -trim +repage #{dir}/#{ocr_image_name}.tif`
    #`textcleaner -e none -f 10 -u -o 5 #{file} #{file_base}_ocr.tif`
    #`tesseract #{dir}/#{ocr_image_name}.tif #{dir}/#{ocr_image_name}_uzn -psm 4`
    generate_image("#{dir}/#{ocr_image_name}.tif")

    # Gather everything produced for this page, then drop the working TIFF.
    FileUtils.mv Dir.glob("#{dir}/*.txt"), f_resting
    FileUtils.mv Dir.glob("#{dir}/*.png"), f_resting
    FileUtils.rm "#{dir}/#{ocr_image_name}.tif"

    # OCR each zone crop on its own; Tesseract is given the crop's name minus
    # ".png" as its output base.
    Dir["#{f_resting}/*.ocr.png"].each do |ocr_file|
      #puts ocr_file
      `tesseract #{ocr_file} #{f_resting}/#{File.basename(ocr_file, File.extname(ocr_file))}`
    end
  end

  FileUtils.rm Dir.glob("#{dir}/*.uzn")
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment