Skip to content

Instantly share code, notes, and snippets.

Created December 3, 2010 21:44
Show Gist options
  • Save anonymous/727607 to your computer and use it in GitHub Desktop.
Save anonymous/727607 to your computer and use it in GitHub Desktop.
#!/usr/bin/env python
# coding: utf-8
# Copyright (c) 2010, Renzo Carbonara <gnuk0001@gmail.com>
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# * Redistributions of source code must retain the above copyright notice,
# this list of conditions and the following disclaimer.
#
# * Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
#
# * Neither the name of the Renzo Carbonara nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
# ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
# ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
import re
import sys
import logging
import unicodedata
from datetime import datetime
import slate
# Configure the root logger at INFO level; the command-line entry point
# may later raise or lower the level of ``log``.
logging.basicConfig(level=logging.INFO)
log = logging.getLogger(__file__)

# Maps the vote outcome word found in the PDF text to the one-letter code
# emitted in the parsed output.
VOTE_RESULTS = {
    'AFIRMATIVO': 'Y',
    'NEGATIVO': 'N',
    'AUSENTE': 'A',
    'ABSTENCION': 'X',
}

# Character-class fragments (upper-case and lower-case letters, including
# the accented ones listed) that are interpolated into the parser's regular
# expressions with ``%(uni_AZ)s`` / ``%(uni_az)s``.
_RE_TOOLS = {
    'uni_AZ': u"A-ZÑÀÁÄÂÉÈËÊÍÌÏÎÓÒÖÔÚÙÜÛ",
    'uni_az': u"a-zñàáäâéèëêíìïîóòöôúùüû",
}
def un(text):
    """Return *text* reduced to ASCII-only unicode text.

    The text is decomposed (NFD) so that accented letters become a base
    letter followed by combining marks; every character that cannot be
    encoded as ASCII is then dropped.

    Example: u'Canción' becomes u'Cancion'.
    """
    decomposed = unicodedata.normalize('NFD', text)
    ascii_bytes = decomposed.encode('ascii', 'ignore')
    return ascii_bytes.decode('ascii')
class VotacionNominal2010PDFParser(object):
    """Parse the text of a "Votacion Nominal" PDF in the 2010 layout.

    The document is loaded with ``slate``.  Each page's text is split into
    a header part and a votes part, and every vote row is broken down into
    surname, given name, party, district and result code.

    The regular expressions below are spelled with string prefixes that are
    valid syntax on both Python 2 and Python 3 (the ``ur`` prefix is a
    syntax error on Python 3); the pattern text itself is unchanged.
    """

    # Text that separates a page's header from its vote rows.
    _VOTES_TABLE_HEADER = u"Apellido y NombreProvinciaBloque Político"

    # Splits the votes text on the result words.  Because the alternation is
    # a capturing group, the result words are kept in the split output.
    _RX_SPLIT = re.compile(r'(AFIRMATIVO|NEGATIVO|AUSENTE|ABSTENCION)')

    # Extracts four groups from a vote row such as:
    #   'BLANCO de PERALTA, BlancaFrente para la Victoria - PJSanta Cruz'
    #   1. surname: everything before the first comma,
    #   2. given name: shortest text ending in a lower-case letter or ')'
    #      that is directly followed by an upper-case letter,
    #   3. party: starts with an upper-case letter and ends right before
    #      another upper-case letter,
    #   4. district: everything that remains.
    # NOTE: the Spanish labels inside the pattern call group 1 "nombre" and
    # group 2 "apellido"; `_parse_page_votes` (and the sample above) treat
    # group 1 as the surname and group 2 as the given name.
    _RX_WHO = re.compile(u"""
        # nombre (whatever antes de la primer coma)
        ^(.*?),\\ *
        # apellido (termina con minúscula preciediendo una mayuscula, o parentesis ('(con licensia)'))
        (.*?[%(uni_az)s\\)])
        # partido (comienza con mayuscula)
        ([%(uni_AZ)s].*?
        # terminacion:
        (?:
        # algunos terminan con minuscula o punto
        [%(uni_az)s\\)]
        # otros terminan en mayuscula[s], (el lookahead acá es el mismo que [AAA])
        |[%(uni_AZ)s]+(?=[%(uni_AZ)s])
        )
        # hasta lookahedear una mayúscula <--- [AAA]
        (?=[%(uni_AZ)s]))
        # distrito (comienza con mayuscula, todo lo restante)
        (.*)$
        """ % _RE_TOOLS, re.X)

    # Header part 1: up to (not including) 'Reunion'.  Applied to text that
    # has already been ASCII-folded and lower-cased.
    _RX_HEADER_1 = re.compile(r"""
        ^pagina\W*
        (?P<pagina>\d+).*?
        votacion\W*
        (?P<votacion_tipo>nominal)
        (?P<periodo_numero>\d+)\W*
        periodo\ legislativo\W*
        (?P<periodo_tipo>ordinario|extraordinario)\W*
        (?P<sesion_numero>\d+)\W*
        sesion\ *
        (?P<sesion_tipo>.*?(?=\ +-))\D*
        (?P<reunion_numero>\d+)$""", re.X)

    # Header part 3: from the "Acta" number up to the president's name.
    _RX_HEADER_3 = re.compile(r"""
        ^(?P<acta_num>\d+)
        Ult\.Mod\.Ver\W*(?P<ult_mod_ver>\d+)
        Fecha:(?P<fecha>[/\d]*) # DD/MM/YYYY
        Hora:(?P<hora>[:\d]*) # HH:MM
        Base.*?:(?P<mayoria_base>.*?)
        Tipo.*?:(?P<mayoria_tipo>.*?)
        Tipo.*?:(?P<quorum_tipo>.*)
        (?P<resultado>AFIRMATIVO|NEGATIVO)
        Miembros\ del\ cuerpo:(?P<miembros_total>\d+)
        .*?Presidente:(?P<presidente_apellido>.*?,)\ *
        (?P<presidente_nombre>.*?[%(uni_az)s](?=[%(uni_AZ)s]))
        """ % _RE_TOOLS, re.X | re.U)

    def parse_pdf(self, f):
        """Parse a whole PDF.

        :param f: file object handed to ``slate.PDF``.
        :returns: ``{'headers': dict, 'votes': list}`` where the headers
            come from the first page and the votes from every page.  A
            document without pages yields empty headers and no votes.
        :raises ValueError: if a page or a vote row cannot be parsed.
        """
        log.debug(u"Loading PDF...")
        doc = slate.PDF(f)
        log.debug(u"PDF loaded.")

        # Initialised up front so that a document with no pages does not
        # fail with an unbound local on return.
        headers_out = {}
        votes_out = []
        for i, page in enumerate(doc):
            # Byte-string pages are decoded as UTF-8; pages that already
            # are text are used as they are.
            if isinstance(page, bytes):
                page = page.decode('utf-8')
            is_first = (i == 0)
            data = self._parse_page(page, parse_headers=is_first)
            if is_first:
                headers_out = data['headers']
            votes_out.extend(data['votes'])
        return {'headers': headers_out, 'votes': votes_out}

    def _parse_page(self, raw_page, parse_headers=True):
        """Split one page into header and votes text and parse them.

        :param raw_page: unicode text of a single page.
        :param parse_headers: when true, the result also has a 'headers' key.
        :returns: dict with 'votes' and, optionally, 'headers'.
        :raises ValueError: if the vote table header does not occur exactly
            once in the page.
        """
        parts = raw_page.split(self._VOTES_TABLE_HEADER)
        if len(parts) != 2:
            # Fail with a clear error instead of dropping into a debugger
            # and then tripping over an unbound variable.
            raise ValueError(
                "expected exactly one vote table header on the page, "
                "found %d" % (len(parts) - 1))
        raw_headers, raw_votes = parts

        out = {'votes': self._parse_page_votes(raw_votes)}
        if parse_headers:
            out['headers'] = self._parse_page_headers(raw_headers)
        return out

    def _parse_page_headers(self, raw_headers):
        """Return the parsed page headers; currently always an empty dict.

        Header parsing is disabled because it does not work for every PDF.
        The unfinished implementation is kept, uncalled, in
        `_parse_page_headers_experimental`.
        """
        return {}

    def _parse_page_headers_experimental(self, raw_headers):
        """Parse the header text of a page (not used; fails on some PDFs).

        Sample header (ignore newlines):

            Página 1 de 9Votación Nominal128 - Periodo Legislativo - Ordinario
            - 18º Sesión Especial - 28º ReuniónExpediente 4165-D-09 - O.D 1247 -
            Vot. en Gral. y ParticularActa Nº1Ult.Mod.Ver. 2Fecha:24/11/2010Hora:1
            3:29Base Mayoria:Votos EmitidosTipo Mayoría:Más de la mitadTipo Quorum
            :Más de la mitadAFIRMATIVOMiembros del cuerpo:257Resultado de la Votac
            ión:Presidente:FELLNER, Eduardo AlfredoIdentificadosSin IdentificarTot
            alDiputadosPresidenteDesempateTotalPresentes2210221Votos Afirmativos21
            2--212Ausentes36Votos Negativos55--Abstenciones3-3

        :param raw_headers: unicode text preceding the vote table header.
        :returns: dict of header fields.
        :raises ValueError, AttributeError: when the text does not have the
            expected shape (a split yields the wrong number of parts or a
            pattern does not match).
        """
        headers = {}

        # Header part 1: everything before the first "Reunión".
        rh1, rht = raw_headers.split(u"Reunión", 1)
        d = self._RX_HEADER_1.match(un(rh1).strip().lower()).groupdict()
        headers.update({
            'pagina': int(d['pagina']),
            'periodo_numero': int(d['periodo_numero']),
            'periodo_tipo': d['periodo_tipo'],
            'reunion_numero': int(d['reunion_numero']),
            'sesion_numero': int(d['sesion_numero']),
            'sesion_tipo': d['sesion_tipo'],
            'votacion_tipo': d['votacion_tipo'],
        })

        # Header part 2: free-text description, up to the "Acta N..." label.
        # NOTE(review): the separator also consumes the lower-case letter
        # right before "Acta", so the description loses its last character.
        rh2, rht = re.split(u"[%(uni_az)s]Acta N.*?(?=\\d)" % _RE_TOOLS, rht)
        headers['description'] = rh2

        # Header part 3: act number, date/time, majority/quorum, president.
        d = self._RX_HEADER_3.match(rht).groupdict()
        headers.update({
            'acta_num': int(d['acta_num']),
            'ult_mod_ver': int(d['ult_mod_ver']),  # meaning not known
            'mayoria_base': un(d['mayoria_base']).lower(),
            'mayoria_tipo': un(d['mayoria_tipo']).lower(),
            'miembros_total': int(d['miembros_total']),
            'presidente_apellido': d['presidente_apellido'].title(),
            'presidente_nombre': d['presidente_nombre'].title(),
            'quorum_tipo': un(d['quorum_tipo']),
            'resultado': VOTE_RESULTS[d['resultado']],
        })
        day, month, year = map(int, d['fecha'].split('/'))
        hour, minute = map(int, d['hora'].split(':'))
        dt = datetime(year, month, day, hour, minute)
        # XXX The UTC offset is hard-coded to -03:00; DST is not considered.
        headers['fecha'] = dt.isoformat() + '-0300'
        return headers

    def _parse_page_votes(self, raw_votes):
        """Parse the vote rows of one page.

        :param raw_votes: unicode text following the vote table header.
        :returns: list of dicts with the keys 'apellido', 'nombre',
            'partido', 'distrito' and 'result'.
        :raises ValueError: if a row does not match the expected layout.
        """
        votes = []
        # The split output alternates "who" text and result word; any
        # trailing text without a result word is dropped by zip().
        chunks = self._RX_SPLIT.split(raw_votes)
        for who, result in zip(chunks[::2], chunks[1::2]):
            match = self._RX_WHO.match(who)
            if match is None:
                # Report through the logger rather than stdout, which may
                # be the stream the JSON output is written to.
                log.error(u"Could not parse vote row: %r", who)
                raise ValueError("could not parse vote row: %r" % (who,))
            apellido, nombre, partido, distrito = match.groups()
            votes.append({
                'apellido': apellido.title(),
                'nombre': nombre.title(),
                'partido': un(partido).lower(),
                'distrito': un(distrito).lower(),
                'result': VOTE_RESULTS[result],
            })
        return votes
# Parser classes available to the command-line entry point, keyed by the
# year given with --year.
PARSERS = {
    2010: VotacionNominal2010PDFParser,
}


def main(argv=None):
    """Command-line entry point: parse a PDF and write the result as JSON.

    :param argv: list of command-line arguments; ``None`` makes argparse
        read them from ``sys.argv``.
    """
    # Imported here, as in the original script section, so that importing
    # this module does not require them.
    import argparse
    import json

    arg_parser = argparse.ArgumentParser(
        description=u"Parse Votacion Nominal PDFs as JSON.")
    arg_parser.add_argument('--quiet', default=False,
                            dest='quiet', action='store_true')
    arg_parser.add_argument('--debug', default=False,
                            dest='debug', action='store_true')
    # `choices` makes an unsupported year a regular usage error instead of
    # a KeyError traceback further down.
    arg_parser.add_argument('--year', '-y', required=True,
                            choices=sorted(PARSERS),
                            dest='year', type=int, action='store')
    arg_parser.add_argument('--outfile', '-o', metavar='FILE',
                            default=sys.stdout,
                            dest='outfile', type=argparse.FileType('wb'),
                            action='store')
    arg_parser.add_argument('--indent', default=None,
                            dest='indent', type=int, action='store')
    arg_parser.add_argument('file', type=argparse.FileType('rb'))
    args = arg_parser.parse_args(argv)

    # --quiet is applied last, so it wins when both flags are given.
    if args.debug:
        log.setLevel(logging.DEBUG)
    if args.quiet:
        log.setLevel(logging.FATAL)

    pdf_parser = PARSERS[args.year]()
    out = json.dumps(pdf_parser.parse_pdf(args.file), indent=args.indent)
    args.outfile.write(out + '\n')


if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment