Created
September 12, 2014 20:54
-
-
Save jatkins/c2189849cca74cb737d3 to your computer and use it in GitHub Desktop.
OCR
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/local/bin/ruby
# Outlines a fixed set of regions on every TIFF found one directory level
# below PATH, writes the result as a PNG next to the TIFF, and finally moves
# all PNGs of a directory into its "png" sub-folder.
require 'fileutils'

PATH = "/mnt/data/working/old_format".freeze
COLORS = ["green", "yellow", "red", "blue", "turquoise1", "DarkBlue", "DeepPink", "pink", "purple", "crimson"].freeze

# Rows of the master zone file are read as whitespace-separated
# "x y width height ..." fields. Rows with fewer than four fields (for example
# blank lines) are skipped instead of producing empty coordinates.
zones = File.readlines("#{PATH}/../master_old.uzn").map(&:split).select { |fields| fields.size >= 4 }

# Print one ImageMagick draw option line per zone. These lines are only
# printed; the convert call below uses the fixed rectangles in region_args.
# The colour index wraps around so zone files longer than COLORS never yield
# an empty "-stroke" value.
boxes = zones.each_with_index.map do |(x, y, width, height), index|
  "-stroke #{COLORS[index % COLORS.size]} -strokewidth 2 " \
    "-draw \"rectangle #{x},#{y} #{x.to_i + width.to_i},#{y.to_i + height.to_i}\" \\\n"
end.join
puts boxes

# Fixed rectangles (colour, top-left corner, bottom-right corner) drawn on
# every image, expanded into convert arguments once, outside the file loop.
region_args = [
  %w[green 1250,600 1600,670],
  %w[yellow 250,1270 600,1340],
  %w[red 525,1270 875,1340],
  %w[blue 800,1270 1350,1340],
  %w[turquoise1 250,1470 600,1540],
  %w[DarkBlue 525,1470 875,1540],
  %w[DeepPink 800,1470 1350,1540],
  %w[pink 135,1090 915,1160],
  %w[purple 200,2440 1570,2570],
  %w[crimson 875,2555 1700,2695]
].flat_map do |color, from, to|
  ['-stroke', color, '-strokewidth', '2', '-draw', "rectangle #{from} #{to}"]
end
# Earlier rectangle set, kept for reference:
# -stroke red -strokewidth 2 -draw "rectangle 865,500 1135,550" \
# -stroke yellow -strokewidth 2 -draw "rectangle 880,560 1430,610" \
# -stroke blue -strokewidth 2 -draw "rectangle 1330,560 1840,610" \
# -stroke green -strokewidth 2 -draw "rectangle 1730,560 2240,610" #{dir}/#{file_base}.png`

# Only real directories are processed; a stray file under PATH would
# otherwise make the mkdir below fail.
Dir["#{PATH}/*"].sort.select { |dir| File.directory?(dir) }.each do |dir|
  puts "Running in #{dir}"

  Dir["#{dir}/*.tif"].sort.each do |file|
    file_base = File.basename(file, File.extname(file))
    # File.exist? replaces File.exists?, which was removed in Ruby 3.2.
    FileUtils.rm "#{dir}/temp.png" if File.exist?("#{dir}/temp.png")
    # Argument-vector form: no shell is involved, so paths containing spaces
    # or shell metacharacters reach convert intact. Output is read and
    # discarded, as with the former backtick call.
    IO.popen(['convert', file, '-fill', 'none', *region_args, "#{dir}/#{file_base}.png"], &:read)
  end

  FileUtils.mkdir_p "#{dir}/png"
  FileUtils.mv Dir.glob("#{dir}/*.png"), "#{dir}/png/"
end
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/opt/jruby/bin/jruby | |
require 'fileutils' | |
require 'peach' | |
# Root folder whose immediate sub-directories hold the scanned TIFF files.
PATH = "/mnt/data/working/old_format".freeze
# Master zone file; each row is split on whitespace by generate_image.
M_UZN = "/mnt/data/working/master_old.uzn".freeze
# Stroke colours handed to ImageMagick, one per zone in file order.
# Frozen so no code path can mutate the shared list.
COLORS = ["green", "yellow", "red", "blue", "turquoise1", "DarkBlue", "DeepPink", "pink", "purple", "crimson", "gold"].freeze
# Crops every zone listed in M_UZN out of +image+ and renders an overview PNG
# of +image+ with each zone outlined.
#
# Rows of M_UZN are read as whitespace-separated "x y width height label"
# fields. Rows with fewer than four fields (for example blank lines) are
# skipped instead of producing an empty crop geometry. Each crop is written to
# "<dir>/<image basename>_<label>.png" and the overview to
# "<dir>/<image basename>.png", where <dir> is the directory of +image+.
#
# Commands are started from argument vectors rather than a shell string, so
# paths containing spaces or shell metacharacters are passed through intact.
#
# @param image [String] path of the image to crop and annotate
# @return [String] standard output of the final (overview) convert call
def generate_image(image)
  dir = File.dirname(image)
  file_name = File.basename(image, File.extname(image))
  overview = ['convert', image, '-fill', 'none', '-strokewidth', '2']

  # File.readlines closes the zone file as soon as it has been read.
  zones = File.readlines(M_UZN).map(&:split).select { |fields| fields.size >= 4 }
  zones.each_with_index do |(x, y, width, height, label), index|
    far_x = x.to_i + width.to_i
    far_y = y.to_i + height.to_i
    IO.popen(['convert', image, '-crop', "#{width}x#{height}+#{x}+#{y}", '+repage',
              "#{dir}/#{file_name}_#{label}.png"], &:read)
    # Wrap around the palette so zone files longer than COLORS never yield an
    # empty "-stroke" value.
    overview.push('-stroke', COLORS[index % COLORS.size],
                  '-draw', "rectangle #{x},#{y} #{far_x},#{far_y}")
  end

  overview << "#{dir}/#{file_name}.png"
  IO.popen(overview, &:read)
end
# For every directory directly under PATH: deskew and trim each TIFF into a
# temporary "<name>_ocr.tif", run tesseract on it, let generate_image produce
# the zone crops and overview, and collect all resulting .txt and .png files
# in "<dir>/ocr/<name>/".
#
# Only real directories are processed; a stray file under PATH would
# otherwise make the mkdir below fail.
Dir["#{PATH}/*"].sort.select { |dir| File.directory?(dir) }.each do |dir|
  files = Dir["#{dir}/*.tif"].sort
  puts "Running in #{dir} with #{files.count} file(s)"
  # mkdir_p is a no-op for an existing directory; it replaces the former
  # File.exists? guard (File.exists? was removed in Ruby 3.2).
  FileUtils.mkdir_p "#{dir}/ocr"

  files.each do |file|
    stem = File.basename(file, File.extname(file))
    ocr_image_name = "#{stem}_ocr"
    ocr_image = "#{dir}/#{ocr_image_name}.tif"
    f_resting = "#{dir}/ocr/#{stem}"
    FileUtils.mkdir_p f_resting

    # The zone file is copied under the same base name as the OCR image,
    # presumably so tesseract picks it up next to the image - confirm.
    FileUtils.cp M_UZN, "#{dir}/#{ocr_image_name}.uzn"

    # Argument-vector form: no shell is involved, so paths containing spaces
    # or shell metacharacters reach the tools intact. Output is read and
    # discarded, as with the former backtick calls.
    IO.popen(['convert', file, '-set', 'filename:f', '%t', '-background', 'black',
              '-fuzz', '75%', '-deskew', '50%', '-trim', '+repage', ocr_image], &:read)
    #`textcleaner -e none -f 10 -u -o 5 #{file} #{file_base}_ocr.tif`
    # NOTE(review): newer tesseract releases spell this option "--psm";
    # verify against the installed version.
    IO.popen(['tesseract', ocr_image, "#{dir}/#{ocr_image_name}_uzn", '-psm', '4'], &:read)
    generate_image(ocr_image)

    FileUtils.mv Dir.glob("#{dir}/*.txt"), f_resting
    FileUtils.mv Dir.glob("#{dir}/*.png"), f_resting
    FileUtils.rm ocr_image
  end

  FileUtils.rm Dir.glob("#{dir}/*.uzn")
end
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/local/bin/ruby | |
require 'fileutils' | |
#require 'peach' | |
# Root folder whose immediate sub-directories hold the scanned TIFF files.
PATH = "/mnt/data/working/old_format".freeze
# Master zone file; each row is split on whitespace by generate_image.
M_UZN = "/mnt/data/working/master_old.uzn".freeze
# Stroke colours handed to ImageMagick, one per zone in file order.
# Frozen so no code path can mutate the shared list.
COLORS = ["green", "yellow", "red", "blue", "turquoise1", "DarkBlue", "DeepPink", "pink", "purple", "crimson", "gold"].freeze
# Crops every zone listed in M_UZN out of +image+ and renders an overview PNG
# of +image+ with each zone outlined.
#
# Rows of M_UZN are read as whitespace-separated
# "x y width height label prefix" fields. Rows with fewer than four fields
# (for example blank lines) are skipped instead of producing an empty crop
# geometry. Each crop is written to
# "<dir>/<prefix>-<image basename>_<label>.ocr.png" and the overview to
# "<dir>/<image basename>.png", where <dir> is the directory of +image+.
#
# Commands are started from argument vectors rather than a shell string, so
# paths containing spaces or shell metacharacters are passed through intact.
#
# @param image [String] path of the image to crop and annotate
# @return [String] standard output of the final (overview) convert call
def generate_image(image)
  dir = File.dirname(image)
  file_name = File.basename(image, File.extname(image))
  overview = ['convert', image, '-fill', 'none', '-strokewidth', '2']

  # File.readlines closes the zone file as soon as it has been read.
  zones = File.readlines(M_UZN).map(&:split).select { |fields| fields.size >= 4 }
  zones.each_with_index do |(x, y, width, height, label, prefix), index|
    far_x = x.to_i + width.to_i
    far_y = y.to_i + height.to_i
    IO.popen(['convert', image, '-crop', "#{width}x#{height}+#{x}+#{y}", '+repage',
              "#{dir}/#{prefix}-#{file_name}_#{label}.ocr.png"], &:read)
    # Wrap around the palette so zone files longer than COLORS never yield an
    # empty "-stroke" value.
    overview.push('-stroke', COLORS[index % COLORS.size],
                  '-draw', "rectangle #{x},#{y} #{far_x},#{far_y}")
  end

  overview << "#{dir}/#{file_name}.png"
  IO.popen(overview, &:read)
end
# For every directory directly under PATH: deskew and trim each TIFF into a
# temporary "<name>_ocr.tif", let generate_image produce the zone crops and
# overview, move the resulting files into "<dir>/ocr/<name>/", and run
# tesseract on each "*.ocr.png" crop individually.
#
# Only real directories are processed; a stray file under PATH would
# otherwise make the mkdir below fail.
Dir["#{PATH}/*"].sort.select { |dir| File.directory?(dir) }.each do |dir|
  files = Dir["#{dir}/*.tif"].sort
  puts "Running in #{dir} with #{files.count} file(s)"
  # mkdir_p is a no-op for an existing directory; it replaces the former
  # File.exists? guard (File.exists? was removed in Ruby 3.2).
  FileUtils.mkdir_p "#{dir}/ocr"

  files.each do |file|
    stem = File.basename(file, File.extname(file))
    ocr_image_name = "#{stem}_ocr"
    ocr_image = "#{dir}/#{ocr_image_name}.tif"
    f_resting = "#{dir}/ocr/#{stem}"
    FileUtils.mkdir_p f_resting
    #FileUtils.cp M_UZN, "#{dir}/#{ocr_image_name}.uzn"

    # Argument-vector form: no shell is involved, so paths containing spaces
    # or shell metacharacters reach the tools intact. Output is read and
    # discarded, as with the former backtick calls.
    IO.popen(['convert', file, '-set', 'filename:f', '%t', '-background', 'black',
              '-fuzz', '75%', '-deskew', '50%', '-trim', '+repage', ocr_image], &:read)
    #`textcleaner -e none -f 10 -u -o 5 #{file} #{file_base}_ocr.tif`
    #`tesseract #{dir}/#{ocr_image_name}.tif #{dir}/#{ocr_image_name}_uzn -psm 4`
    generate_image(ocr_image)

    FileUtils.mv Dir.glob("#{dir}/*.txt"), f_resting
    FileUtils.mv Dir.glob("#{dir}/*.png"), f_resting
    FileUtils.rm ocr_image

    # OCR each crop on its own; the output base keeps the ".ocr" part of the
    # crop's name because only the final ".png" extension is stripped.
    Dir["#{f_resting}/*.ocr.png"].each do |ocr_file|
      #puts ocr_file
      output_base = "#{f_resting}/#{File.basename(ocr_file, File.extname(ocr_file))}"
      IO.popen(['tesseract', ocr_file, output_base], &:read)
    end
  end

  FileUtils.rm Dir.glob("#{dir}/*.uzn")
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment