Created
October 2, 2009 20:53
-
-
Save tmcw/200104 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env ruby | |
# coding: utf-8 | |
require 'rubygems'
require 'fastercsv'
require 'pdf/reader'
require 'csv'
# Adds a row-oriented CSV serializer to Array.
#
# NOTE(review): the csv library shipped with Ruby 1.9+ defines its own
# Array#to_csv (single row); this definition replaces it — confirm that is
# intended wherever this file is loaded.
class Array
  # Serializes the receiver, treated as an array of rows, into a single CSV
  # string with one line per row.
  #
  # CSV::Writer.generate only exists in the Ruby 1.8 csv library; newer
  # versions expose CSV.generate instead, so use whichever is available.
  #
  # Returns the CSV text as a String (empty when the array is empty).
  def to_csv
    if defined?(CSV::Writer) && CSV::Writer.respond_to?(:generate)
      str = ''
      CSV::Writer.generate(str) do |csv|
        each { |row| csv << row }
      end
      str
    else
      CSV.generate do |csv|
        each { |row| csv << row }
      end
    end
  end
end
# Extract all text from a single PDF
#
# Callback receiver handed to PDF::Reader: collects the text drawn on each
# page, one string per page, in +content+.
class PageTextReceiver
  # Array of Strings, one accumulated text buffer per page seen so far.
  attr_accessor :content

  def initialize
    @content = []
  end

  # Called when page parsing starts; opens a fresh text buffer for the page.
  def begin_page(arg = nil)
    puts "reading a page"
    # String.new gives an explicitly mutable buffer to append to.
    @content << String.new('')
  end

  # Returns true when +string+ contains no word characters at all.
  def is_weird_blank(string)
    string !~ /\w/
  end

  # record text that is drawn on the page
  # avoid the odd UTF-8 character sequence used for polling stations
  #
  # Each non-blank fragment is stripped and appended, followed by a single
  # space, to the current page's buffer. Extra operands are ignored.
  def show_text(string, *params)
    return if is_weird_blank(string)

    # Text can arrive before any begin_page callback; open a buffer so the
    # append below never runs against nil.
    @content << String.new('') if @content.empty?
    @content.last << string.strip + " "
  end

  # there's a few text callbacks, so make sure we process them all
  alias :super_show_text :show_text
  alias :move_to_next_line_and_show_text :show_text
  alias :set_spacing_next_line_show_text :show_text

  # this final text callback takes slightly different arguments
  #
  # The first operand is a list mixing strings with non-string entries;
  # only the strings are recorded. A missing operand is treated as an
  # empty list.
  def show_text_with_positioning(*params)
    Array(params.first).each { |str| show_text(str) if str.kind_of?(String) }
  end
end
# Footer text used as the page separator when splitting the document.
# NOTE(review): the stray space in "prior t o" is part of the literal being
# matched — presumably how the extracted text reads; confirm before "fixing".
PAGE_FOOTER = 'Preliminary result, approved by the Independent Election Commission but subject to challenge in accordance with the law prior t o final certification.'.freeze

# Tag removed (first occurrence only) from each page.
PAGE_TAG = 'Independent Election Commission'.freeze

# Splits the document text into pages at +footer+, removes the first
# occurrence of +tag+ from each page, trims surrounding whitespace and
# re-joins the pages with single spaces.
#
# string - the whole document text.
# footer - page separator (defaults to the results-sheet footer).
# tag    - text removed once per page (defaults to the commission name).
#
# Returns the cleaned text as one String ("" for empty input).
def pager(string, footer = PAGE_FOOTER, tag = PAGE_TAG)
  string.split(footer).map { |page| page.sub(tag, '').strip }.join(' ')
end
# Extracts the grand total recorded on each "Total" row.
#
# tables - document text in which each total row reads
#          "<station id> Total <numbers...>".
#
# Returns a Hash mapping station id (String) to grand total (String). The
# value is nil when fewer than two numbers follow "Total".
#
# NOTE(review): the character class in the pattern also admits a literal
# '+', which may be unintended; the pattern is left unchanged.
def totals(tables)
  stations = {}
  tables.scan(/(\d+) Total ([\d+\s]*)/).each do |station_id, numbers|
    # The string of numbers is polling stations, grand total, then next polling station
    vote_counts = numbers.split(' ')
    # since vote_counts[-1] is the next polling station
    stations[station_id] = vote_counts[-2]
  end
  stations
end
# Parses candidate rows ("<name> <numbers...>") out of the document text.
#
# Rows are processed in order:
# * rows whose name is listed in +place_names+ are dropped;
# * a "Total" row is dropped, its last number becomes the polling station
#   attached to the rows that follow, and the last number of the row kept
#   just before it (if any) is discarded;
# * every other row is kept as [name, numbers, polling_station].
#
# The rows are collected into a new array in a single pass rather than
# deleted from the list being iterated, so a "Total" or place-name row at
# the very start or end of the text no longer raises or touches the wrong
# row.
#
# tables      - document text.
# place_names - names to skip (defaults to ['Kabul']).
#               TODO: compensate for all place names
#
# Returns an Array of [name (String), numbers (Array of String),
# polling_station]; polling_station is 0 until the first "Total" row.
def candidates(tables, place_names = ['Kabul'])
  polling_station = 0
  rows = []
  tables.scan(/([A-Za-z\. ]{2,}) ([\d\s]*)/).each do |name, numbers|
    counts = numbers.strip.split(' ')
    if place_names.include?(name)
      next
    elsif name == 'Total'
      polling_station = counts.pop
      rows.last[1].pop unless rows.empty?
    else
      rows << [name, counts, polling_station]
    end
  end
  rows
end
# Collapses each row's list of numbers (element 1) to its last entry.
#
# candidates - Array of rows whose element 1 is an Array; the rows are
#              modified in place.
#
# Returns the same +candidates+ array.
def candidate_totals(candidates)
  candidates.each { |row| row[1] = row[1].last }
end
# Read the PDF, feeding every text callback into the receiver.
receiver = PageTextReceiver.new
PDF::Reader.file("500a.pdf", receiver)

# Concatenate the per-page strings with join: Array#to_s only did that on
# Ruby 1.8; on later versions it returns the inspect form (brackets, quotes
# and escapes), which would corrupt the text being parsed.
document = pager(receiver.content.join)

# The block form closes out.csv even if writing a row raises.
# NOTE(review): the fastercsv gem targets Ruby 1.8; on newer Rubies the
# standard csv library is its successor — confirm the target Ruby version.
FasterCSV.open('out.csv', 'w') do |csv_out|
  candidate_totals(candidates(document)).each { |row| csv_out << row }
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment