Created
September 23, 2019 19:45
-
-
Save ryanfb/5ed1a16c1016980dd2b2effa0a747778 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env ruby
#
# Cuts a PDF into per-text-run image/transcription pairs.
#
# Usage: <this script> FILE.pdf
#
# Steps:
#   1. `pdftohtml -xml` writes an XML file (named after the PDF, in the current
#      directory) whose <text> elements carry top/left/width/height attributes.
#   2. ImageMagick `convert` renders the PDF to pdfimages/page_NNNNN.png,
#      grayscale, resized to the width/height of the FIRST <page> in the XML so
#      the XML boxes can be used as crop geometry.
#   3. Every non-blank <text> child of each <page> (skipping any containing the
#      word "Copyright") is cropped to groundtruth/NNNNNN.png and its text is
#      written to groundtruth/NNNNNN.gt.txt. Numbering runs across all pages.
#
# External commands are invoked with argument lists (no shell), so file names
# containing spaces or shell metacharacters are passed through verbatim.

Encoding.default_external = Encoding::UTF_8
Encoding.default_internal = Encoding::UTF_8

require 'fileutils'
require 'nokogiri'

PAGE_IMAGE_DIR = 'pdfimages'
GROUNDTRUTH_DIR = 'groundtruth'

# Runs an external command without a shell and aborts the script if it cannot
# be started or exits non-zero.
def run_or_abort(*command)
  return if system(*command)

  abort "Command failed: #{command.join(' ')}"
end

# True for <text> nodes that have visible content and do not mention
# 'Copyright'.
def groundtruth_line?(node)
  return false unless node.name == 'text'

  content = node.content
  !content.strip.empty? && !content.include?('Copyright')
end

pdf_filename = ARGV[0]
abort "Usage: #{File.basename($PROGRAM_NAME)} FILE.pdf" if pdf_filename.nil? || pdf_filename.empty?
abort "No such file: #{pdf_filename}" unless File.file?(pdf_filename)

xml_filename = File.basename(pdf_filename, '.pdf') + '.xml'

$stderr.puts "Converting to XML..."
run_or_abort('pdftohtml', '-c', '-zoom', '3', '-xml', pdf_filename, xml_filename)
doc = File.open(xml_filename) { |f| Nokogiri::XML(f) }

pages = doc.xpath('//page')
abort "No <page> elements found in #{xml_filename}" if pages.empty?

# NOTE(review): only the first page's dimensions are used for every page;
# PDFs with mixed page sizes would presumably yield misaligned crops — confirm.
first_page = pages.first
width = first_page.attr('width').to_i
height = first_page.attr('height').to_i

$stderr.puts "Converting to #{width}x#{height} PNG files..."
FileUtils.mkdir_p(PAGE_IMAGE_DIR)
# NOTE(review): `+matte` is legacy ImageMagick syntax; newer releases may
# require `-alpha off` instead — confirm against the installed version.
run_or_abort(
  'convert', '-density', '300', pdf_filename,
  '-type', 'Grayscale', '-background', 'white', '+matte',
  '-resize', "#{width}x#{height}", '-depth', '8',
  File.join(PAGE_IMAGE_DIR, 'page_%05d.png')
)

$stderr.puts "Cropping into groundtruth..."
FileUtils.mkdir_p(GROUNDTRUTH_DIR)

line_number = 0 # running index across all pages; names the output pairs
pages.each_with_index do |page, page_index|
  page_filename = File.join(PAGE_IMAGE_DIR, format('page_%05d.png', page_index))
  $stderr.puts page_filename

  page.children.select { |node| groundtruth_line?(node) }.each do |text|
    png_filename = File.join(GROUNDTRUTH_DIR, format('%06d.png', line_number))
    txt_filename = File.join(GROUNDTRUTH_DIR, format('%06d.gt.txt', line_number))

    # ImageMagick crop geometry: width x height + left + top
    geometry = "#{text['width']}x#{text['height']}+#{text['left']}+#{text['top']}"
    cropped = system('convert', page_filename, '-crop', geometry, '+repage', png_filename)
    # A single bad crop is reported but does not stop the run.
    warn "Crop failed for #{png_filename} (#{geometry})" unless cropped

    File.write(txt_filename, text.content)
    $stderr.puts "#{png_filename}: #{text.content}"
    line_number += 1
  end
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment