/gist:727607

## gistfile1.py
#!/usr/bin/env python
# coding: utf-8

# Copyright (c) 2010, Renzo Carbonara <gnuk0001@gmail.com>
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
#     * Redistributions of source code must retain the above copyright notice,
#       this list of conditions and the following disclaimer.
#
#     * Redistributions in binary form must reproduce the above copyright
#       notice, this list of conditions and the following disclaimer in the
#       documentation and/or other materials provided with the distribution.
#
#     * Neither the name of the Renzo Carbonara nor the names of its
#       contributors may be used to endorse or promote products derived from
#       this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
# ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
# ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.


import re
import sys
import logging
import unicodedata
from datetime import datetime

import slate


# Install a default root handler at INFO; the __main__ block below adjusts
# this module's logger level from --debug / --quiet.
logging.basicConfig(level=logging.INFO)
# Name the logger after the module rather than the file path: a path such as
# "gistfile1.py" contains a dot, which the logging hierarchy interprets as a
# parent/child separator.
log = logging.getLogger(__name__)


# Maps the vote word found in the PDF text to the one-letter code stored
# under 'result' (and 'resultado') in the parsed output.
VOTE_RESULTS = dict(
    AFIRMATIVO='Y',
    NEGATIVO='N',
    AUSENTE='A',
    ABSTENCION='X')

# Regex character-class fragments (meant to sit inside [...]) that are
# interpolated into the parser patterns with the % operator:
#   uni_AZ -- A-Z plus the accented uppercase letters listed
#   uni_az -- a-z plus the accented lowercase letters listed
_RE_TOOLS = {
    'uni_AZ': ur"A-ZÑÀÁÄÂÉÈËÊÍÌÏÎÓÒÖÔÚÙÜÛ",
    'uni_az': ur"a-zñàáäâéèëêíìïîóòöôúùüû" }


def un(text):
    """
    Strip diacritics from unicode text, returning ASCII-only unicode text.

    The text is decomposed (NFD) so accents become separate combining marks,
    then every code point that is not ASCII is discarded.

    Example: u'Canción' becomes u'Cancion'.
    """
    decomposed = unicodedata.normalize('NFD', text)
    ascii_bytes = decomposed.encode('ascii', 'ignore')
    return ascii_bytes.decode('ascii')


class VotacionNominal2010PDFParser(object):
    _RX_SPLIT = re.compile(r'(AFIRMATIVO|NEGATIVO|AUSENTE|ABSTENCION)')

    # used to extract nombre, apellido, partido, and distrito from a string like:
    #   'BLANCO de PERALTA, BlancaFrente para la Victoria - PJSanta Cruz'
    _RX_WHO = re.compile(ur"""
            # nombre (whatever antes de la primer coma)
            ^(.*?),\ *
            # apellido (termina con minúscula preciediendo una mayuscula, o parentesis ('(con licensia)'))
            (.*?[%(uni_az)s\)])
            # partido (comienza con mayuscula)
            ([%(uni_AZ)s].*?
                # terminacion:
                (?:
                   # algunos terminan con minuscula o punto
                    [%(uni_az)s\)]
                   # otros terminan en mayuscula[s], (el lookahead acá es el mismo que [AAA])
                   |[%(uni_AZ)s]+(?=[%(uni_AZ)s])
                )
                # hasta lookahedear una mayúscula <--- [AAA]
                (?=[%(uni_AZ)s]))
            # distrito (comienza con mayuscula, todo lo restante)
            (.*)$
        """ % _RE_TOOLS, re.X)

    # header part 1: Up to (not inclunding) 'Reunion'
    _RX_HEADER_1 = re.compile(ur"""
            ^pagina\W*
            (?P<pagina>\d+).*?
            votacion\W*
            (?P<votacion_tipo>nominal)
            (?P<periodo_numero>\d+)\W*
            periodo\ legislativo\W*
            (?P<periodo_tipo>ordinario|extraordinario)\W*
            (?P<sesion_numero>\d+)\W*
            sesion\ *
            (?P<sesion_tipo>.*?(?=\ +-))\D*
            (?P<reunion_numero>\d+)$""", re.X)

    # header part 3:
    _RX_HEADER_3 = re.compile(ur"""
            ^(?P<acta_num>\d+)
            Ult\.Mod\.Ver\W*(?P<ult_mod_ver>\d+)
            Fecha:(?P<fecha>[/\d]*)   # DD/MM/YYYY
            Hora:(?P<hora>[:\d]*)     # HH:MM
            Base.*?:(?P<mayoria_base>.*?)
            Tipo.*?:(?P<mayoria_tipo>.*?)
            Tipo.*?:(?P<quorum_tipo>.*)
            (?P<resultado>AFIRMATIVO|NEGATIVO)
            Miembros\ del\ cuerpo:(?P<miembros_total>\d+)
            .*?Presidente:(?P<presidente_apellido>.*?,)\ *
            (?P<presidente_nombre>.*?[%(uni_az)s](?=[%(uni_AZ)s]))
        """ % _RE_TOOLS, re.X | re.U)


    def parse_pdf(self, f):
        log.debug(u"Loading PDF...")
        doc = slate.PDF(f)
        log.debug(u"PDF loaded.")

        votes_out = []
        for i,raw_page in enumerate(p.decode('utf-8') for p in doc):
            if i == 0:
                data = self._parse_page(raw_page, parse_headers=True)
                headers_out = data['headers']
            else:
                data = self._parse_page(raw_page, parse_headers=False)
            votes_out.extend(data['votes'])
        return { 'headers': headers_out, 'votes': votes_out }

    def _parse_page(self, raw_page, parse_headers=True):
        try:
            raw_headers, raw_votes = raw_page.split(u"Apellido y NombreProvinciaBloque Político")
        except:
            import ipdb; ipdb.set_trace()
        out = { 'votes': self._parse_page_votes(raw_votes) }
        if parse_headers:
            out['headers'] = self._parse_page_headers(raw_headers)
        return out

    def _parse_page_headers(self, raw_headers):
        # this shit aint working for every pdf.
        return {}

        # Sample header (ignore newlines)
        #
        # Página   1 de   9Votación Nominal128 - Periodo Legislativo - Ordinario
        #  - 18º Sesión  Especial - 28º ReuniónExpediente 4165-D-09 - O.D 1247 -
        # Vot. en Gral. y ParticularActa Nº1Ult.Mod.Ver. 2Fecha:24/11/2010Hora:1
        # 3:29Base Mayoria:Votos EmitidosTipo Mayoría:Más de la mitadTipo Quorum
        # :Más de la mitadAFIRMATIVOMiembros del cuerpo:257Resultado de la Votac
        # ión:Presidente:FELLNER, Eduardo AlfredoIdentificadosSin IdentificarTot
        # alDiputadosPresidenteDesempateTotalPresentes2210221Votos Afirmativos21
        # 2--212Ausentes36Votos Negativos55--Abstenciones3-3

        headers = {}

        # header part 1
        rh1, rht = raw_headers.split(u"Reunión", 1)
        d = self._RX_HEADER_1.match(un(rh1).strip().lower()).groupdict()
        headers.update({          # Vertical alignment pleases my mind.
            'pagina'              : int(d['pagina']),
            'periodo_numero'      : int(d['periodo_numero']),
            'periodo_tipo'        :     d['periodo_tipo'],
            'reunion_numero'      : int(d['reunion_numero']),
            'sesion_numero'       : int(d['sesion_numero']),
            'sesion_tipo'         :     d['sesion_tipo'],
            'votacion_tipo'       :     d['votacion_tipo'] })

        # header part 2
        rh2, rht = re.split(ur"[%(uni_az)s]Acta N.*?(?=\d)" % _RE_TOOLS, rht)
        headers['description'] = rh2

        # header part 3
        d = self._RX_HEADER_3.match(rht).groupdict()
        headers.update({              # Vertical alignment pleases my mind.
            'acta_num'            :          int(d['acta_num']),
            'ult_mod_ver'         :          int(d['ult_mod_ver']), # WTF is this?
            'mayoria_base'        :           un(d['mayoria_base']).lower(),
            'mayoria_tipo'        :           un(d['mayoria_tipo']).lower(),
            'miembros_total'      :          int(d['miembros_total']),
            'presidente_apellido' :              d['presidente_apellido'].title(),
            'presidente_nombre'   :              d['presidente_nombre'].title(),
            'quorum_tipo'         :           un(d['quorum_tipo']),
            'resultado'           : VOTE_RESULTS[d['resultado']] })

        day, month, year = map(int, d['fecha'].split('/'))
        hour, minute = map(int, d['hora'].split(':'))
        dt = datetime(year, month, day, hour, minute)
        # XXX Harcode timezone to -03:00. DST my balls. Time Zones suck
        headers['fecha'] = dt.isoformat() + '-0300'

        return headers

    def _parse_page_votes(self, raw_votes):
        votes = []
        raw_vlines = self._RX_SPLIT.split(raw_votes)
        for who, result in zip(raw_vlines[::2], raw_vlines[1::2]):
            try:
                apellido, nombre, partido, distrito = self._RX_WHO.match(who).groups()
            except:
                print repr(who)
                raise
            votes.append({
                'apellido' : apellido.title(),
                'nombre'   : nombre.title(),
                'partido'  : un(partido).lower(),
                'distrito' : un(distrito).lower(),
                'result'   : VOTE_RESULTS[result] })
        return votes


# Registry of parser classes keyed by the year given with --year.
PARSERS = {}
PARSERS[2010] = VotacionNominal2010PDFParser


if __name__ == '__main__':
    # Command line entry point: parse one PDF and write the result as JSON.
    import argparse
    import json

    arg_parser = argparse.ArgumentParser(description=u"Parse Votacion Nominal PDFs as JSON.")
    arg_parser.add_argument('--quiet', default=False,
                            dest='quiet', action='store_true')
    arg_parser.add_argument('--debug', default=False,
                            dest='debug', action='store_true')
    # choices= turns an unsupported year into a usage error instead of a
    # KeyError traceback at the PARSERS lookup below.
    arg_parser.add_argument('--year', '-y', required=True,
                            dest='year', type=int, action='store',
                            choices=sorted(PARSERS))
    arg_parser.add_argument('--outfile', '-o', metavar='FILE', default=sys.stdout,
                            dest='outfile', type=argparse.FileType('wb'), action='store')
    arg_parser.add_argument('--indent', default=None,
                            dest='indent', type=int, action='store')
    arg_parser.add_argument('file', type=argparse.FileType('rb'))
    args = arg_parser.parse_args()

    # --quiet is applied last, so it wins when both flags are given.
    if args.debug:
        log.setLevel(logging.DEBUG)
    if args.quiet:
        log.setLevel(logging.FATAL)

    pdf_parser = PARSERS[args.year]()
    try:
        result = pdf_parser.parse_pdf(args.file)
    finally:
        args.file.close()

    out = json.dumps(result, indent=args.indent)
    try:
        # Same bytes as `print >>args.outfile, out`: the text plus a newline.
        args.outfile.write(out + '\n')
    finally:
        # Never close the interpreter's own stdout.
        if args.outfile is not sys.stdout:
            args.outfile.close()
	#!/usr/bin/env python
	# coding: utf-8

	# Copyright (c) 2010, Renzo Carbonara <gnuk0001@gmail.com>
	# All rights reserved.
	#
	# Redistribution and use in source and binary forms, with or without
	# modification, are permitted provided that the following conditions are met:
	#
	# * Redistributions of source code must retain the above copyright notice,
	# this list of conditions and the following disclaimer.
	#
	# * Redistributions in binary form must reproduce the above copyright
	# notice, this list of conditions and the following disclaimer in the
	# documentation and/or other materials provided with the distribution.
	#
	# * Neither the name of the Renzo Carbonara nor the names of its
	# contributors may be used to endorse or promote products derived from
	# this software without specific prior written permission.
	#
	# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
	# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
	# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
	# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
	# ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
	# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
	# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
	# ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
	# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
	# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.


	import re
	import sys
	import logging
	import unicodedata
	from datetime import datetime

	import slate


	# Install a default root handler at INFO; the __main__ block adjusts this
	# module's logger level from --debug / --quiet.
	logging.basicConfig(level=logging.INFO)
	# Name the logger after the module rather than the file path: a path such
	# as "gistfile1.py" contains a dot, which the logging hierarchy interprets
	# as a parent/child separator.
	log = logging.getLogger(__name__)


	# Maps the vote word found in the PDF text to the one-letter code stored
	# under 'result' (and 'resultado') in the parsed output.
	VOTE_RESULTS = dict(
	    AFIRMATIVO='Y',
	    NEGATIVO='N',
	    AUSENTE='A',
	    ABSTENCION='X')

	# Regex character-class fragments (meant to sit inside [...]) that are
	# interpolated into the parser patterns with the % operator:
	#   uni_AZ -- A-Z plus the accented uppercase letters listed
	#   uni_az -- a-z plus the accented lowercase letters listed
	_RE_TOOLS = {
	'uni_AZ': ur"A-ZÑÀÁÄÂÉÈËÊÍÌÏÎÓÒÖÔÚÙÜÛ",
	'uni_az': ur"a-zñàáäâéèëêíìïîóòöôúùüû" }


	def un(text):
	    """
	    Strip diacritics from unicode text, returning ASCII-only unicode text.

	    The text is decomposed (NFD) so accents become separate combining
	    marks, then every code point that is not ASCII is discarded.

	    Example: u'Canción' becomes u'Cancion'.
	    """
	    # The body is indented under the def; at the def's own column the
	    # docstring and return statement are an IndentationError.
	    decomposed = unicodedata.normalize('NFD', text)
	    return decomposed.encode('ascii', 'ignore').decode('ascii')


	class VotacionNominal2010PDFParser(object):
	_RX_SPLIT = re.compile(r'(AFIRMATIVO\|NEGATIVO\|AUSENTE\|ABSTENCION)')

	# used to extract nombre, apellido, partido, and distrito from a string like:
	# 'BLANCO de PERALTA, BlancaFrente para la Victoria - PJSanta Cruz'
	_RX_WHO = re.compile(ur"""
	# nombre (whatever antes de la primer coma)
	^(.?),\
	# apellido (termina con minúscula preciediendo una mayuscula, o parentesis ('(con licensia)'))
	(.*?[%(uni_az)s\)])
	# partido (comienza con mayuscula)
	([%(uni_AZ)s].*?
	# terminacion:
	(?:
	# algunos terminan con minuscula o punto
	[%(uni_az)s\)]
	# otros terminan en mayuscula[s], (el lookahead acá es el mismo que [AAA])
	\|[%(uni_AZ)s]+(?=[%(uni_AZ)s])
	)
	# hasta lookahedear una mayúscula <--- [AAA]
	(?=[%(uni_AZ)s]))
	# distrito (comienza con mayuscula, todo lo restante)
	(.*)$
	""" % _RE_TOOLS, re.X)

	# header part 1: Up to (not inclunding) 'Reunion'
	_RX_HEADER_1 = re.compile(ur"""
	^pagina\W*
	(?P<pagina>\d+).*?
	votacion\W*
	(?P<votacion_tipo>nominal)
	(?P<periodo_numero>\d+)\W*
	periodo\ legislativo\W*
	(?P<periodo_tipo>ordinario\|extraordinario)\W*
	(?P<sesion_numero>\d+)\W*
	sesion\ *
	(?P<sesion_tipo>.?(?=\ +-))\D
	(?P<reunion_numero>\d+)$""", re.X)

	# header part 3:
	_RX_HEADER_3 = re.compile(ur"""
	^(?P<acta_num>\d+)
	Ult\.Mod\.Ver\W*(?P<ult_mod_ver>\d+)
	Fecha:(?P<fecha>[/\d]*) # DD/MM/YYYY
	Hora:(?P<hora>[:\d]*) # HH:MM
	Base.?:(?P<mayoria_base>.?)
	Tipo.?:(?P<mayoria_tipo>.?)
	Tipo.?:(?P<quorum_tipo>.)
	(?P<resultado>AFIRMATIVO\|NEGATIVO)
	Miembros\ del\ cuerpo:(?P<miembros_total>\d+)
	.?Presidente:(?P<presidente_apellido>.?,)\ *
	(?P<presidente_nombre>.*?[%(uni_az)s](?=[%(uni_AZ)s]))
	""" % _RE_TOOLS, re.X \| re.U)


	def parse_pdf(self, f):
	log.debug(u"Loading PDF...")
	doc = slate.PDF(f)
	log.debug(u"PDF loaded.")

	votes_out = []
	for i,raw_page in enumerate(p.decode('utf-8') for p in doc):
	if i == 0:
	data = self._parse_page(raw_page, parse_headers=True)
	headers_out = data['headers']
	else:
	data = self._parse_page(raw_page, parse_headers=False)
	votes_out.extend(data['votes'])
	return { 'headers': headers_out, 'votes': votes_out }

	def _parse_page(self, raw_page, parse_headers=True):
	try:
	raw_headers, raw_votes = raw_page.split(u"Apellido y NombreProvinciaBloque Político")
	except:
	import ipdb; ipdb.set_trace()
	out = { 'votes': self._parse_page_votes(raw_votes) }
	if parse_headers:
	out['headers'] = self._parse_page_headers(raw_headers)
	return out

	def _parse_page_headers(self, raw_headers):
	# this shit aint working for every pdf.
	return {}

	# Sample header (ignore newlines)
	#
	# Página 1 de 9Votación Nominal128 - Periodo Legislativo - Ordinario
	# - 18º Sesión Especial - 28º ReuniónExpediente 4165-D-09 - O.D 1247 -
	# Vot. en Gral. y ParticularActa Nº1Ult.Mod.Ver. 2Fecha:24/11/2010Hora:1
	# 3:29Base Mayoria:Votos EmitidosTipo Mayoría:Más de la mitadTipo Quorum
	# :Más de la mitadAFIRMATIVOMiembros del cuerpo:257Resultado de la Votac
	# ión:Presidente:FELLNER, Eduardo AlfredoIdentificadosSin IdentificarTot
	# alDiputadosPresidenteDesempateTotalPresentes2210221Votos Afirmativos21
	# 2--212Ausentes36Votos Negativos55--Abstenciones3-3

	headers = {}

	# header part 1
	rh1, rht = raw_headers.split(u"Reunión", 1)
	d = self._RX_HEADER_1.match(un(rh1).strip().lower()).groupdict()
	headers.update({ # Vertical alignment pleases my mind.
	'pagina' : int(d['pagina']),
	'periodo_numero' : int(d['periodo_numero']),
	'periodo_tipo' : d['periodo_tipo'],
	'reunion_numero' : int(d['reunion_numero']),
	'sesion_numero' : int(d['sesion_numero']),
	'sesion_tipo' : d['sesion_tipo'],
	'votacion_tipo' : d['votacion_tipo'] })

	# header part 2
	rh2, rht = re.split(ur"[%(uni_az)s]Acta N.*?(?=\d)" % _RE_TOOLS, rht)
	headers['description'] = rh2

	# header part 3
	d = self._RX_HEADER_3.match(rht).groupdict()
	headers.update({ # Vertical alignment pleases my mind.
	'acta_num' : int(d['acta_num']),
	'ult_mod_ver' : int(d['ult_mod_ver']), # WTF is this?
	'mayoria_base' : un(d['mayoria_base']).lower(),
	'mayoria_tipo' : un(d['mayoria_tipo']).lower(),
	'miembros_total' : int(d['miembros_total']),
	'presidente_apellido' : d['presidente_apellido'].title(),
	'presidente_nombre' : d['presidente_nombre'].title(),
	'quorum_tipo' : un(d['quorum_tipo']),
	'resultado' : VOTE_RESULTS[d['resultado']] })

	day, month, year = map(int, d['fecha'].split('/'))
	hour, minute = map(int, d['hora'].split(':'))
	dt = datetime(year, month, day, hour, minute)
	# XXX Harcode timezone to -03:00. DST my balls. Time Zones suck
	headers['fecha'] = dt.isoformat() + '-0300'

	return headers

	def _parse_page_votes(self, raw_votes):
	votes = []
	raw_vlines = self._RX_SPLIT.split(raw_votes)
	for who, result in zip(raw_vlines[::2], raw_vlines[1::2]):
	try:
	apellido, nombre, partido, distrito = self._RX_WHO.match(who).groups()
	except:
	print repr(who)
	raise
	votes.append({
	'apellido' : apellido.title(),
	'nombre' : nombre.title(),
	'partido' : un(partido).lower(),
	'distrito' : un(distrito).lower(),
	'result' : VOTE_RESULTS[result] })
	return votes


	# Registry of parser classes keyed by the year given with --year.
	PARSERS = {}
	PARSERS[2010] = VotacionNominal2010PDFParser


	if __name__ == '__main__':
	    # Command line entry point: parse one PDF and write the result as JSON.
	    import argparse
	    import json

	    arg_parser = argparse.ArgumentParser(description=u"Parse Votacion Nominal PDFs as JSON.")
	    arg_parser.add_argument('--quiet', default=False,
	                            dest='quiet', action='store_true')
	    arg_parser.add_argument('--debug', default=False,
	                            dest='debug', action='store_true')
	    # choices= turns an unsupported year into a usage error instead of a
	    # KeyError traceback at the PARSERS lookup below.
	    arg_parser.add_argument('--year', '-y', required=True,
	                            dest='year', type=int, action='store',
	                            choices=sorted(PARSERS))
	    arg_parser.add_argument('--outfile', '-o', metavar='FILE', default=sys.stdout,
	                            dest='outfile', type=argparse.FileType('wb'), action='store')
	    arg_parser.add_argument('--indent', default=None,
	                            dest='indent', type=int, action='store')
	    arg_parser.add_argument('file', type=argparse.FileType('rb'))
	    args = arg_parser.parse_args()

	    # --quiet is applied last, so it wins when both flags are given.
	    if args.debug:
	        log.setLevel(logging.DEBUG)
	    if args.quiet:
	        log.setLevel(logging.FATAL)

	    pdf_parser = PARSERS[args.year]()
	    try:
	        result = pdf_parser.parse_pdf(args.file)
	    finally:
	        args.file.close()

	    out = json.dumps(result, indent=args.indent)
	    try:
	        # Same bytes as `print >>args.outfile, out`: the text plus a newline.
	        args.outfile.write(out + '\n')
	    finally:
	        # Never close the interpreter's own stdout.
	        if args.outfile is not sys.stdout:
	            args.outfile.close()