bcardiff/README.md

## README.md

      
    Raw
  

              README.md
            
          
    $ brew install imagemagick
$ brew install tesseract
$ ls -l1
Basic Infrastructure.pdf
extract.rb

$ ruby extract.rb 2> /dev/null
extracting page 1
.....................
extracting page 2
.....

$ ls -l1
Basic Infrastructure.pdf
extract.rb
data.csv

The output is written to data.csv.
The script extracts an image for each page (extract_page).
For each row it extracts an image containing the id value using convert from ImageMagick, and then uses Tesseract OCR on that image to get the actual value. This is done in extract_id.
To extract a checkbox value, a sub-image of the checkbox is generated and the average value of all its pixels is used. An unchecked checkbox is mostly white, while a checked one contains a bit of black, so its average decreases. This is done in extract_chk.
get_id_crop_coordinates and get_check_crop_coordinates give the cropping coordinates for the id value and the checkbox. These values differ depending on whether the page is the first one or not.

  
## extract.rb
# CSV destination shared by the whole script through the $output global.
# Sync mode is switched on so each row is flushed as soon as it is written
# instead of sitting in Ruby's internal buffer.
$output = File.new('data.csv', 'w').tap { |csv| csv.sync = true }

# Renders one page of the PDF to page.png and makes it the current page.
#
# index - 1-based page number. It is remembered in $page_index (read by the
#         crop-coordinate helpers) and turned into the 0-based selector that
#         is appended to the PDF file name in square brackets.
#
# Prints a progress line and returns whatever the convert command wrote to
# its standard output.
def extract_page(index)
  $page_index = index
  puts "\nextracting page #{index}"
  zero_based = index - 1
  `convert -density 300 Basic\\ Infrastructure.pdf[#{zero_based}] page.png`
end

# Appends one CSV line for a table row of the current page to $output.
#
# index - 1-based row number on the page; stored in $row_index so the
#         crop-coordinate helpers know which row to cut out.
#
# The line is the OCR'd id followed by the seven checkbox states
# (true/false), comma separated. A dot is printed as progress feedback.
def extract_row(index)
  print "."
  $row_index = index
  # Debug aid: dump the whole checkbox strip of the row for visual inspection.
  # `convert page.png -crop #{get_check_row_coordinates} checkboxes-#{$row_index}.png`
  fields = [extract_id]
  (1..7).each { |column| fields << extract_chk(column) }
  $output.puts(fields.join(","))
end

# Reads the id of the current row via OCR.
#
# Crops the id cell out of page.png into number.png (geometry comes from
# get_id_crop_coordinates), runs tesseract on that file and returns the
# recognised text with surrounding whitespace removed.
def extract_id
  geometry = get_id_crop_coordinates
  `convert page.png -crop #{geometry} number.png`
  recognised = `tesseract number.png stdout`
  recognised.strip
end

# Builds the crop geometry string ("WxH+X+Y") for the id cell of the current
# row, based on $page_index and $row_index.
#
# The cell is 165x60 at x = 303. The first row starts at y = 421 on page 1
# and y = 360 on every other page; each further row is 65.6 px lower, with
# the resulting y truncated to an integer.
def get_id_crop_coordinates
  first_row_top = $page_index == 1 ? 421 : 360
  rows_above = $row_index - 1
  top = (first_row_top + 65.6 * rows_above).to_i
  "165x60+303+#{top}"
end

# Tells whether a checkbox of the current row is ticked.
#
# index - 1-based checkbox column.
#
# Asks convert for the grayscale mean of the checkbox crop and returns true
# when that mean is below 53000 (darker than the threshold), false otherwise.
# Note that empty command output parses as 0.0 and therefore also yields true.
# NOTE(review): the 53000 threshold presumably assumes a 16-bit quantum
# range (0..65535) - confirm against the installed ImageMagick build.
def extract_chk(index)
  geometry = get_check_crop_coordinates(index)
  # Debug aid: save the individual checkbox crop for visual inspection.
  # `convert page.png -crop #{get_check_crop_coordinates(index)} checkboxes-#{$row_index}-#{index}.png`
  mean_text = `convert page.png -crop #{geometry} -type Grayscale -format "%[mean]\n" info:`
  mean_intensity = mean_text.strip.to_f
  mean_intensity < 53000
end

# Builds the crop geometry string ("WxH+X+Y") for one checkbox of the
# current row, based on $page_index and $row_index.
#
# index - 1-based checkbox column.
#
# Each crop is 33x33. Columns start at x = 1071 and are 176 px apart. The
# first row starts at y = 435 on page 1 and y = 374 on every other page;
# rows are 65.5 px apart, with the resulting y truncated to an integer.
def get_check_crop_coordinates(index)
  first_row_top = $page_index == 1 ? 435 : 374
  left = 1071 + 176 * (index - 1)
  top = (first_row_top + 65.5 * ($row_index - 1)).to_i
  "33x33+#{left}+#{top}"
end

# Builds the crop geometry string ("WxH+X+Y") for a 1250x33 strip of the
# current row starting at x = 1071, the x where the checkbox columns begin.
# In the visible code it is only referenced from a commented-out debugging
# command.
#
# The vertical origin is 435 on page 1 and 374 on every other page; rows
# are 65.5 px apart, with the resulting y truncated to an integer.
def get_check_row_coordinates
  first_row_top = $page_index == 1 ? 435 : 374
  top = (first_row_top + 65.5 * ($row_index - 1)).to_i
  "1250x33+1071+#{top}"
end

# Walk all 141 pages. Every page holds 40 rows except the first (39 rows)
# and the last (12 rows). Each page is rendered once, then every row of it
# is extracted in order.
rows_on_page = { 1 => 39, 141 => 12 }

(1..141).each do |page|
  extract_page page
  row_count = rows_on_page.fetch(page, 40)
  (1..row_count).each { |row| extract_row row }
end

$output.close
	# Destination CSV, kept in a global so every helper can write to it.
	# With sync enabled each write is flushed immediately rather than buffered.
	$output = File.new('data.csv', 'w')
	$output.sync = true

	# Converts a single PDF page to page.png at 300 dpi and records it as the
	# page being processed.
	#
	# index - 1-based page number; saved in $page_index and passed to convert
	#         as index - 1 inside the square-bracket selector.
	#
	# Writes a progress message to stdout. Returns the command's stdout text.
	def extract_page(index)
	  puts "\nextracting page #{index}"
	  $page_index = index
	  command = "convert -density 300 Basic\\ Infrastructure.pdf[#{index - 1}] page.png"
	  `#{command}`
	end

	# Writes the CSV record for one row of the current page.
	#
	# index - 1-based row number; saved in $row_index for the coordinate
	#         helpers.
	#
	# The record is "<id>,<chk1>,...,<chk7>" where each chk is true or false.
	# Emits a "." on stdout as a progress marker.
	def extract_row(index)
	  print "."
	  $row_index = index
	  # Debug aid: write out the row's checkbox strip.
	  # `convert page.png -crop #{get_check_row_coordinates} checkboxes-#{$row_index}.png`
	  id = extract_id
	  checks = (1..7).map { |column| extract_chk(column) }
	  $output.puts([id, *checks].join(","))
	end

	# OCRs the id cell of the current row.
	#
	# The cell is cropped from page.png into number.png using the geometry
	# from get_id_crop_coordinates; tesseract's stdout, stripped of leading
	# and trailing whitespace, is returned.
	def extract_id
	  crop = get_id_crop_coordinates
	  `convert page.png -crop #{crop} number.png`
	  `tesseract number.png stdout`.strip
	end

	# Returns the "WxH+X+Y" crop geometry of the id cell for the row selected
	# by $page_index / $row_index.
	#
	# Width 165, height 60, x = 303. y starts at 421 on page 1 and at 360 on
	# any other page, advancing 65.6 px per row and truncated to an integer.
	def get_id_crop_coordinates
	  base_y = if $page_index == 1
	    421
	  else
	    360
	  end
	  offset_y = (base_y + 65.6 * ($row_index - 1)).to_i
	  "165x60+303+" + offset_y.to_s
	end

	# Reports whether the given checkbox of the current row is checked.
	#
	# index - 1-based checkbox column.
	#
	# The checkbox crop's grayscale mean is obtained from convert; a mean
	# strictly below 53000 counts as checked (true). Output that does not
	# parse as a number becomes 0.0 and so also counts as checked.
	def extract_chk(index)
	  # Debug aid: save the checkbox crop to a file.
	  # `convert page.png -crop #{get_check_crop_coordinates(index)} checkboxes-#{$row_index}-#{index}.png`
	  crop = get_check_crop_coordinates(index)
	  raw_mean = `convert page.png -crop #{crop} -type Grayscale -format "%[mean]\n" info:`
	  53000 > raw_mean.strip.to_f
	end

	# Returns the "WxH+X+Y" crop geometry of one checkbox in the row selected
	# by $page_index / $row_index.
	#
	# index - 1-based checkbox column.
	#
	# Crops are 33x33. x starts at 1071 and advances 176 px per column. y
	# starts at 435 on page 1 and at 374 on any other page, advancing 65.5 px
	# per row and truncated to an integer.
	def get_check_crop_coordinates(index)
	  start_x = 1071
	  cell_width = 176
	  start_y = $page_index == 1 ? 435 : 374
	  row_height = 65.5
	  # Both offsets are products; the multiplication operators must be
	  # explicit, otherwise Ruby reads `name(...)` as a method call.
	  x = start_x + cell_width * (index - 1)
	  y = (start_y + row_height * ($row_index - 1)).to_i
	  "33x33+#{x}+#{y}"
	end

	# Returns the "WxH+X+Y" crop geometry of a 1250x33 strip of the current
	# row beginning at x = 1071.
	#
	# y starts at 435 on page 1 and at 374 on any other page, advancing
	# 65.5 px per row and truncated to an integer.
	def get_check_row_coordinates
	  base_y = if $page_index == 1
	    435
	  else
	    374
	  end
	  offset_y = (base_y + 65.5 * ($row_index - 1)).to_i
	  "1250x33+1071+" + offset_y.to_s
	end

	# Process the whole document page by page: render the page, then extract
	# each of its rows in order.

	# First page: 39 rows.
	extract_page 1
	(1..39).each do |i|
	  extract_row i
	end

	# Pages 2 to 140: 40 rows each.
	(2..140).each do |page|
	  extract_page page
	  (1..40).each do |i|
	    extract_row i
	  end
	end

	# Last page: 12 rows.
	extract_page 141
	(1..12).each do |i|
	  extract_row i
	end

	$output.close