tmcw/gist:200104

## gistfile1.rb
#!/usr/bin/env ruby
# coding: utf-8
require 'rubygems'
require 'fastercsv'
require 'pdf/reader'
require 'CSV'

class Array
  # Renders the receiver as one CSV document and returns it as a String.
  #
  # Every element of the receiver is appended to the CSV writer as one row,
  # so the receiver is expected to be an array of rows (arrays of fields).
  #
  # CSV::Writer.generate belongs to the Ruby 1.8 CSV library. Later versions
  # of the library do not provide it, so CSV.generate is used there instead.
  #
  # NOTE(review): on Ruby 1.9+ the csv library ships its own Array#to_csv
  # (one array -> one CSV line); this definition replaces it - confirm that
  # is intended.
  def to_csv
    if defined?(CSV::Writer) && CSV::Writer.respond_to?(:generate)
      # Ruby 1.8 API: rows are written into the string passed in.
      str = ''
      CSV::Writer.generate(str) do |csv|
        each { |row| csv << row }
      end
      str
    else
      # Ruby 1.9+ API: the generated text is the return value.
      CSV.generate do |csv|
        each { |row| csv << row }
      end
    end
  end
end


# Extract all text from a single PDF

# Callback receiver that accumulates drawn text page by page.
# +content+ holds one String per begin_page call, in call order.
class PageTextReceiver
  attr_accessor :content

  # Starts with no pages recorded.
  def initialize
    @content = []
  end

  # Page-start callback: prints a progress line to stdout and opens a new,
  # empty text buffer for the page. The argument is accepted and ignored.
  def begin_page(arg = nil)
    puts "reading a page"
    @content.push("")
  end

  # True when +string+ contains no word characters at all.
  def is_weird_blank(string)
    words = string.scan(/\w+/)
    words.empty?
  end

  # Text callback: appends the stripped text followed by one space to the
  # buffer of the current page. Strings rejected by is_weird_blank (the odd
  # UTF-8 character sequence used for polling stations) are skipped.
  # Any extra arguments are ignored.
  def show_text(string, *params)
    return if is_weird_blank(string)

    current_page = @content.last
    current_page << string.strip + " "
  end

  # The remaining single-string text callbacks are handled identically.
  alias_method :super_show_text, :show_text
  alias_method :move_to_next_line_and_show_text, :show_text
  alias_method :set_spacing_next_line_show_text, :show_text

  # This callback's first argument is a list; only its String members are
  # text, so each of those is forwarded to show_text and the rest is skipped.
  def show_text_with_positioning(*params)
    pieces = params.first
    pieces.each do |piece|
      show_text(piece) if piece.is_a?(String)
    end
  end
end

# Merges the page bodies of the extracted document into one string.
#
# The text is cut at every occurrence of the per-page footer sentence. In each
# piece the first 'Independent Election Commission' tag is removed and the
# surrounding whitespace stripped; the pieces are then joined by single spaces.
def pager(string)
  footer = 'Preliminary result, approved by the Independent Election Commission but subject to challenge in accordance with the law prior t o final certification.'
  pages = string.split(footer)
  cleaned = pages.map do |page|
    page.sub('Independent Election Commission', '').strip
  end
  cleaned.join(' ')
end

# Builds a Hash keyed by the number found directly before each " Total "
# marker. The value is the second-to-last entry of the run of numbers that
# follows the marker (nil when the run has fewer than two entries).
def totals(tables)
  matches = tables.scan(/(\d+) Total ([\d+\s]*)/)
  matches.inject({}) do |stations, (station_id, numbers)|
    # The run is: polling station counts, grand total, then the next polling
    # station - so the grand total sits one position before the end.
    stations[station_id] = numbers.split(' ')[-2]
    stations
  end
end

# Turns the flattened table text into rows of [name, counts, polling_station].
#
# The text is scanned for "<name> <run of numbers>" pairs; each run is split
# into an array of number strings. The pairs are then walked in order:
#
# * 'Kabul' entries are dropped.
# * 'Total' entries are dropped too, but first their last number becomes the
#   polling station attached to every following row, and the last number of
#   the row emitted just before them is removed.
# * every other entry is emitted with the current polling station appended
#   (0 until the first 'Total' has been seen).
#
# Rows are collected into a new array instead of deleting from the array
# being iterated, so an entry that follows a removed one is still checked and
# a 'Total'/'Kabul' at the very end (or a 'Total' at the very start) no
# longer raises or touches an unrelated row.
#
# Returns an Array of [String, Array<String>, polling_station] rows.
def candidates(tables)
  polling_station = 0
  rows = []
  tables.scan(/([A-Za-z\. ]{2,}) ([\d\s]*)/).each do |name, numbers|
    counts = numbers.strip.split(' ')
    case name
    when 'Kabul'
      # TODO: compensate for all place names
      next
    when 'Total'
      polling_station = counts.pop
      # drop the trailing number of the row that precedes the Total line
      rows.last[1].pop unless rows.empty?
    else
      rows << [name, counts, polling_station]
    end
  end
  rows
end

# Collapses the second field of every row, in place, to its last element,
# and returns the same collection that was passed in.
def candidate_totals(candidates)
  candidates.each do |entry|
    tallies = entry[1]
    entry[1] = tallies.last
  end
  candidates
end

# Feed 500a.pdf through the receiver so that it collects the text of each page.
receiver = PageTextReceiver.new
PDF::Reader.file("500a.pdf", receiver)

# Array#to_s concatenates the elements only on Ruby 1.8; from 1.9 on it
# returns the inspect form (brackets, quotes, commas). join gives the plain
# concatenated text on every version.
document = receiver.content.join

# The block form closes out.csv when the block exits, even if a row raises.
FasterCSV.open('out.csv', 'w') do |csv_out|
  candidate_totals(candidates(pager(document))).each { |row| csv_out << row }
end
	#!/usr/bin/env ruby
	# coding: utf-8
	require 'rubygems'
	require 'fastercsv'
	require 'pdf/reader'
	require 'CSV'

	class Array
	  # Renders the receiver as one CSV document and returns it as a String.
	  #
	  # Every element of the receiver is appended to the CSV writer as one row,
	  # so the receiver is expected to be an array of rows (arrays of fields).
	  #
	  # CSV::Writer.generate belongs to the Ruby 1.8 CSV library. Later versions
	  # of the library do not provide it, so CSV.generate is used there instead.
	  #
	  # NOTE(review): on Ruby 1.9+ the csv library ships its own Array#to_csv
	  # (one array -> one CSV line); this definition replaces it - confirm that
	  # is intended.
	  def to_csv
	    if defined?(CSV::Writer) && CSV::Writer.respond_to?(:generate)
	      # Ruby 1.8 API: rows are written into the string passed in.
	      str = ''
	      CSV::Writer.generate(str) do |csv|
	        each { |row| csv << row }
	      end
	      str
	    else
	      # Ruby 1.9+ API: the generated text is the return value.
	      CSV.generate do |csv|
	        each { |row| csv << row }
	      end
	    end
	  end
	end


	# Extract all text from a single PDF

	# Callback receiver that accumulates drawn text page by page.
	# +content+ holds one String per begin_page call, in call order.
	class PageTextReceiver
	  attr_accessor :content

	  # Starts with no pages recorded.
	  def initialize
	    @content = []
	  end

	  # Page-start callback: prints a progress line to stdout and opens a new,
	  # empty text buffer for the page. The argument is accepted and ignored.
	  def begin_page(arg = nil)
	    puts "reading a page"
	    @content.push("")
	  end

	  # True when +string+ contains no word characters at all.
	  def is_weird_blank(string)
	    words = string.scan(/\w+/)
	    words.empty?
	  end

	  # Text callback: appends the stripped text followed by one space to the
	  # buffer of the current page. Strings rejected by is_weird_blank (the odd
	  # UTF-8 character sequence used for polling stations) are skipped.
	  # Any extra arguments are ignored.
	  def show_text(string, *params)
	    return if is_weird_blank(string)

	    current_page = @content.last
	    current_page << string.strip + " "
	  end

	  # The remaining single-string text callbacks are handled identically.
	  alias_method :super_show_text, :show_text
	  alias_method :move_to_next_line_and_show_text, :show_text
	  alias_method :set_spacing_next_line_show_text, :show_text

	  # This callback's first argument is a list; only its String members are
	  # text, so each of those is forwarded to show_text and the rest is skipped.
	  def show_text_with_positioning(*params)
	    pieces = params.first
	    pieces.each do |piece|
	      show_text(piece) if piece.is_a?(String)
	    end
	  end
	end

	# Merges the page bodies of the extracted document into one string.
	#
	# The text is cut at every occurrence of the per-page footer sentence. In each
	# piece the first 'Independent Election Commission' tag is removed and the
	# surrounding whitespace stripped; the pieces are then joined by single spaces.
	def pager(string)
	  footer = 'Preliminary result, approved by the Independent Election Commission but subject to challenge in accordance with the law prior t o final certification.'
	  pages = string.split(footer)
	  cleaned = pages.map do |page|
	    page.sub('Independent Election Commission', '').strip
	  end
	  cleaned.join(' ')
	end

	# Builds a Hash keyed by the number found directly before each " Total "
	# marker. The value is the second-to-last entry of the run of numbers that
	# follows the marker (nil when the run has fewer than two entries).
	def totals(tables)
	  matches = tables.scan(/(\d+) Total ([\d+\s]*)/)
	  matches.inject({}) do |stations, (station_id, numbers)|
	    # The run is: polling station counts, grand total, then the next polling
	    # station - so the grand total sits one position before the end.
	    stations[station_id] = numbers.split(' ')[-2]
	    stations
	  end
	end

	# Turns the flattened table text into rows of [name, counts, polling_station].
	#
	# The text is scanned for "<name> <run of numbers>" pairs; each run is split
	# into an array of number strings. The pairs are then walked in order:
	#
	# * 'Kabul' entries are dropped.
	# * 'Total' entries are dropped too, but first their last number becomes the
	#   polling station attached to every following row, and the last number of
	#   the row emitted just before them is removed.
	# * every other entry is emitted with the current polling station appended
	#   (0 until the first 'Total' has been seen).
	#
	# Rows are collected into a new array instead of deleting from the array
	# being iterated, so an entry that follows a removed one is still checked and
	# a 'Total'/'Kabul' at the very end (or a 'Total' at the very start) no
	# longer raises or touches an unrelated row.
	#
	# Returns an Array of [String, Array<String>, polling_station] rows.
	def candidates(tables)
	  polling_station = 0
	  rows = []
	  tables.scan(/([A-Za-z\. ]{2,}) ([\d\s]*)/).each do |name, numbers|
	    counts = numbers.strip.split(' ')
	    case name
	    when 'Kabul'
	      # TODO: compensate for all place names
	      next
	    when 'Total'
	      polling_station = counts.pop
	      # drop the trailing number of the row that precedes the Total line
	      rows.last[1].pop unless rows.empty?
	    else
	      rows << [name, counts, polling_station]
	    end
	  end
	  rows
	end

	# Collapses the second field of every row, in place, to its last element,
	# and returns the same collection that was passed in.
	def candidate_totals(candidates)
	  candidates.each do |entry|
	    tallies = entry[1]
	    entry[1] = tallies.last
	  end
	  candidates
	end

	# Feed 500a.pdf through the receiver so that it collects the text of each page.
	receiver = PageTextReceiver.new
	PDF::Reader.file("500a.pdf", receiver)

	# Array#to_s concatenates the elements only on Ruby 1.8; from 1.9 on it
	# returns the inspect form (brackets, quotes, commas). join gives the plain
	# concatenated text on every version.
	document = receiver.content.join

	# The block form closes out.csv when the block exits, even if a row raises.
	FasterCSV.open('out.csv', 'w') do |csv_out|
	  candidate_totals(candidates(pager(document))).each { |row| csv_out << row }
	end