Skip to content

Instantly share code, notes, and snippets.

@tmcw
Created October 2, 2009 20:53
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save tmcw/200104 to your computer and use it in GitHub Desktop.
Save tmcw/200104 to your computer and use it in GitHub Desktop.
#!/usr/bin/env ruby
# coding: utf-8
require 'rubygems'
require 'fastercsv'
require 'pdf/reader'
require 'CSV'
# Core extension: lets an array of rows be rendered as one CSV string.
class Array
  # Feeds every element of the receiver to a CSV::Writer as one row and
  # returns the text the writer accumulated.
  def to_csv
    buffer = ''
    CSV::Writer.generate(buffer) { |writer| each { |row| writer << row } }
    buffer
  end
end
# Callback receiver for PDF::Reader that collects the text drawn on each
# page of a single PDF, one string per page, in `content`.
class PageTextReceiver
  attr_accessor :content

  def initialize
    @content = []
  end

  # Page-start callback: logs progress and opens an empty buffer for the
  # text of the new page.
  def begin_page(arg = nil)
    puts "reading a page"
    @content.push("")
  end

  # True when the string contains no word characters at all.
  def is_weird_blank(string)
    string.scan(/\w+/).length.zero?
  end

  # Text-drawing callback: appends the stripped text plus one trailing
  # space to the current page. Strings without any word character are
  # skipped, which filters out the odd UTF-8 character sequence used for
  # polling stations.
  def show_text(string, *params)
    return if is_weird_blank(string)

    @content.last << (string.strip + " ")
  end

  # The other plain text callbacks are handled exactly like show_text.
  [
    :super_show_text,
    :move_to_next_line_and_show_text,
    :set_spacing_next_line_show_text
  ].each { |callback| alias_method callback, :show_text }

  # This callback receives an array as its first argument; only the
  # String entries of that array are recorded as text.
  def show_text_with_positioning(*params)
    fragments = params.first
    fragments.each do |fragment|
      show_text(fragment) if fragment.is_a?(String)
    end
  end
end
# Rebuilds the document text without page furniture: the input is cut at
# every page footer, the first 'Independent Election Commission' tag in
# each piece is removed, each piece is trimmed, and the pieces are joined
# with a single space.
def pager(string)
  footer = 'Preliminary result, approved by the Independent Election Commission but subject to challenge in accordance with the law prior t o final certification.'
  pages = string.split(footer)
  cleaned = pages.map do |page|
    # sub (not gsub): only the first occurrence per page is dropped
    page.sub('Independent Election Commission', '').strip
  end
  cleaned.join(' ')
end
# Builds a hash mapping each station id (the number directly before the
# word "Total") to that row's grand total.
#
# The run of numbers after "Total" is treated as: per-station counts, the
# grand total, then the id of the next polling station, so the grand
# total is the second-to-last number of the run (nil when the run has
# fewer than two numbers).
def totals(tables)
  stations = {}
  # NOTE: inside the character class the `+` is a literal plus sign.
  tables.scan(/(\d+) Total ([\d+\s]*)/).each do |station_id, number_run|
    counts = number_run.split(' ')
    stations[station_id] = counts[-2]
  end
  stations
end
# Parses the flattened table text into candidate rows.
#
# Every "<name> <numbers>" run in +tables+ becomes a row. Rows are then
# processed in order:
# * 'Kabul' rows (a place name, not a candidate) are dropped.
#   TODO: compensate for all place names
# * 'Total' rows are dropped; the last number of a Total row becomes the
#   polling station assigned to the candidates that follow it, and the
#   last number of the candidate row kept just before it is discarded.
# * every other row is kept and tagged with the current polling station
#   (0 until the first Total row has been seen).
#
# Returns an array of [name, numbers, polling_station] where numbers is
# an array of digit strings.
#
# The result is built in a fresh array instead of deleting from the array
# being iterated. The in-place deletion raised NoMethodError when a
# 'Kabul' or 'Total' row was last, never inspected the row that slid into
# a deleted 'Total' slot, and for a leading 'Total' row popped a value
# from the last row of the document.
def candidates(tables)
  polling_station = 0
  rows = tables.scan(/([A-Za-z\. ]{2,}) ([\d\s]*)/).map do |name, numbers|
    [name, numbers.strip.split(' ')]
  end

  report = []
  rows.each do |name, numbers|
    case name
    when 'Kabul'
      next
    when 'Total'
      polling_station = numbers.pop
      # Drop the trailing number of the previously kept candidate row;
      # nothing to trim when the Total row comes before any candidate.
      report.last[1].pop unless report.empty?
    else
      report << [name, numbers, polling_station]
    end
  end
  report
end
# Collapses each row's list of numbers (slot 1) to its final entry.
# The rows are modified in place and the same outer array is returned.
def candidate_totals(candidates)
  candidates.each do |row|
    numbers = row[1]
    row[1] = numbers.last
  end
end
# Script body: read 500a.pdf through the text receiver, turn the collected
# text into one [name, last number, polling station] row per candidate,
# and write those rows to out.csv.
receiver = PageTextReceiver.new
PDF::Reader.file("500a.pdf", receiver)

# Concatenate the per-page strings explicitly. Array#to_s only joins the
# elements on Ruby 1.8; on later versions it returns the inspect form
# (brackets, quotes, escapes), which would corrupt the parsed text.
page_text = receiver.content.join

rows = candidate_totals(candidates(pager(page_text)))

# The block form closes (and therefore flushes) out.csv when the block
# exits, even if writing a row raises.
FasterCSV.open('out.csv', 'w') do |csv_out|
  rows.each { |row| csv_out << row }
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment