Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
#!/usr/bin/env ruby
# coding: utf-8
# @todo Properly case the organization name.
# @todo Handle case where the organization has >1 contact but no email
# address on the first contact, e.g. CSSS DE CHICOUTIMI.
# @todo For organizations without any email address, HTML-ify their
# postal address and put it in the notes section.
# @todo Investigate why we get BAIEJAMES in some cases.
require 'csv'
require 'active_support/inflector'
# Fetch the source PDF when it is not on disk yet, or when the --clobber
# command-line flag asks for a fresh copy.
needs_download = ARGV.include?('--clobber') || !File.exist?('CAI_liste_resp_acces.pdf')
`curl -O http://www.cai.gouv.qc.ca/documents/CAI_liste_resp_acces.pdf` if needs_download
# A block whose joined text is exactly one of these page headers/footers is
# skipped by the parser.
ignore = /\A(Dernière mise à jour : \d{4}-\d{2}-\d{2} \d{2}:\d{2}|Page \d+|Répertoire des organismes assujettis et des responsables de l'accès aux documents des organismes publics et de la protection des renseignements personnels)\z/
# Table-of-contents heading in effect for the blocks being parsed.
type = nil
# The first two non-ignored blocks of the document (the table of contents and
# its page numbers); nil until they have been seen.
block_1 = block_2 = nil
# Lines accumulated for the block currently being read.
block = []
# Tags to emit in the CSV "tag_string" column, keyed by the category heading
# under which an organization is listed. Categories mapped to an empty list
# get no tags.
$tags = {
  "AGENCES DE LA SANTÉ" => %w[santé],
  "AUTRES ORGANISMES GOUVERNEMENTAUX" => [],
  "CÉGEPS" => [],
  "CENTRE DE COMMUNICATIONS SANTÉ (911)" => [],
  "CENTRE DE SANTÉ ET DE SERVICES SOCIAUX (CSSS)" => [],
  "CENTRES D'HÉBERGEMENT ET DE RÉADAPTATION" => [],
  "CENTRES HOSPITALIERS" => %w[santé hôpitaux],
  "CENTRES JEUNESSE" => %w[jeunesse],
  "COMMISSIONS SCOLAIRES" => [],
  "ÉTABLISSEMENTS PRIVÉS SUBVENTIONNÉS" => [],
  "MINISTÈRES" => [],
  "MUNICIPALITÉS" => [],
  "MUNICIPALITÉS RÉGIONALES DE COMTÉ (MRC)" => [],
  "OFFICES MUNICIPAUX D'HABITATION" => [],
  "ORDRES PROFESSIONNELS" => [],
  "ORGANISMES MUNICIPAUX" => [],
  "RÉGIES INTERMUNICIPALES" => [],
  "UNIVERSITÉS" => [],
}
# Removes the first element of the array that matches the regular expression
# and returns what the expression's first capturing group matched in it. The
# array is left untouched, and nil is returned, when no element matches.
#
# @param [Array] array the array to search (modified in place on a match)
# @param [Regexp] regexp a regular expression with a capturing group
# @return [String, nil] the first capturing group, or nil if nothing matched
def find_and_delete(array, regexp)
  position = array.index { |element| element[regexp] }
  return if position.nil?

  removed = array.delete_at(position)
  removed[regexp, 1]
end
# Builds one contact record from the lines of a block.
#
# The phone, fax, toll-free and email lines are pulled out of the block first
# (the block is modified in place); the remaining lines are then classified in
# order as organization name, contact name, role or address.
#
# @param [Array<String>] block the stripped lines of the block, without any
#   leading table-of-contents heading
# @param [String, nil] type the table-of-contents heading currently in effect
# @return [Hash] the record, with :organization, :role and :address joined
#   into single strings
def parse_contact_block(block, type)
  organization = {
    organization: [],
    name: nil,
    role: [],
    address: [],
    voice: find_and_delete(block, /\ATél\. : ([\d# -]+)\z/),
    fax: find_and_delete(block, /\ATéléc\. : ([\d# -]+)\z/),
    tollfree: find_and_delete(block, /\ASans frais : ([\d -]+)\z/),
    email: find_and_delete(block, /\A(\S+@\S+)\z/),
    type: type,
  }
  block.each do |x|
    # Ensure that organizations are read before names, and roles before
    # addresses. Addresses swallow whatever is left.
    if organization[:name].nil?
      # Lines made only of uppercase letters, digits, punctuation and spaces
      # belong to the organization; the first other line is the contact name.
      if x[/\A[\p{Lu}\p{N}\p{Punct}\p{Space}]+\z/]
        organization[:organization] << x
      else
        organization[:name] = x
      end
    elsif organization[:address].empty? && !x[/\A\d|\bC\.P\. /]
      # Until a line starting with a digit or containing "C.P. " begins the
      # address, lines after the name are part of the role.
      organization[:role] << x
    else
      organization[:address] << x
    end
  end
  organization[:organization] = organization[:organization].join(' ')
  organization[:role] = organization[:role].join(' ')
  organization[:address] = organization[:address].join(' ')
  organization
end

organizations = []
lines = `pdftotext CAI_liste_resp_acces.pdf -`.split("\n").map(&:strip)
# String#split drops trailing empty fields, so a final block that is not
# followed by a blank line would never be parsed. An empty sentinel line
# guarantees that the last block is flushed.
lines << ''
lines.each do |line|
  # Collect a block, then parse it once a blank line ends it.
  unless line.empty?
    block << line
    next
  end
  text = block * ' '
  unless text.empty? || text[ignore]
    if block_1.nil?
      # The first block is the table of contents.
      block_1 = block
    elsif block_2.nil?
      # The second block is the page numbers for the table of contents.
      block_2 = block
    else
      # The first line of a block is sometimes an item from the table of
      # contents; it then becomes the type of this and the following blocks.
      if block_1.include?(block.first)
        type = block.first
        block.shift
      end
      organizations << parse_contact_block(block, type)
    end
  end
  block = []
end
# Alaveteli does not support multiple contacts per public body, so the "safe"
# list keeps only the first record seen for each organization name.
safe = organizations.uniq { |record| record[:organization] }

# Summary of how many records were parsed and how many carry each detail.
puts format("%4d organizations", organizations.size)
puts format("%4d voice", organizations.count { |record| record[:voice] })
puts format("%4d fax", organizations.count { |record| record[:fax] })
puts format("%4d email", organizations.count { |record| record[:email] })
puts format("%4d safe", safe.size)
# Appends a header row and one row per organization that has an email address
# to the given CSV. Rows are numbered consecutively from 1; organizations
# without an email address are skipped and do not consume an id.
#
# @param [Array<Hash>] array records with :organization, :email and :type keys
# @param [#<<] csv the destination; each row is appended with <<
# @raise [RuntimeError] if a record with an email address has a nil or empty
#   organization name
# @raise [KeyError] if a record's :type is not a key of $tags
def output_to_csv(array, csv)
  csv << ["#id", "name", "request_email", "tag_string"]
  # Also supported: notes, publication_scheme, home_page
  # Supported but undocumented: short_name, disclosure_log

  # Skip organizations without email addresses for now.
  # TODO: HTML-ify their postal address and put into "notes".
  with_email = array.select { |organization| organization[:email] }
  with_email.each.with_index(1) do |organization, id|
    name = organization[:organization]
    # The parser joins the name lines into a string, so a missing name shows
    # up as "" (which is truthy) rather than nil; reject both.
    fail "no name for #{organization}" if name.nil? || name.empty?
    tag_string = $tags.fetch(organization[:type]).join(" ")
    csv << [id, name, organization[:email], tag_string]
  end
end
# Write the full list, then the list restricted to one contact per
# organization, each to its own CSV file.
{
  'organizations.csv' => organizations,
  'organizations-alaveteli-safe.csv' => safe,
}.each do |path, records|
  CSV.open(path, 'w') { |csv| output_to_csv(records, csv) }
end
@scjody
Copy link
Author

scjody commented Jun 5, 2013

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment