Skip to content

Instantly share code, notes, and snippets.

@vibragiel
Created February 3, 2013 15:28
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save vibragiel/4702176 to your computer and use it in GitHub Desktop.
Save vibragiel/4702176 to your computer and use it in GitHub Desktop.
Script para convertir los papeles de Bárcenas publicados por EL PAÍS a json.
# -*- coding: utf8 -*-
import re
import json
from lxml import html
try:
from urllib2 import urlopen
except ImportError:
from urllib import urlopen
# URL of the EL PAÍS page that this script downloads and parses.
PAPELES_URL = (
    "http://elpais.com/especiales/2013/caso_barcenas/"
    "todos_los_papeles.html"
)
# Matches an already-normalised amount (digits and "." only), one space, and
# a unit: "ptas", the literal "(¿?)" marker, or "€".  Backslashes are doubled
# so the literal contains no invalid escape sequences; the compiled pattern is
# the same as before.
money_re = re.compile(u"([0-9.]+) (ptas|\\(¿\\?\\)|€)")


def parse_number(s):
    """Split a Spanish-formatted money string into (amount, unit).

    "." is treated as the thousands separator and "," as the decimal mark,
    so "1.234,56 €" yields (1234.56, "€").

    Args:
        s: Cell text such as "1.000 ptas"; may be empty or None.

    Returns:
        A (float, unit) tuple, or (None, None) when the text is empty, does
        not start with "<amount> <unit>", or the amount is not a valid
        number (for example a lone "," or several decimal marks).
    """
    if not s:
        return None, None
    # Surrounding whitespace would defeat the anchored match below.
    normalised = s.strip().replace(".", "").replace(",", ".")
    m = money_re.match(normalised)
    if m is None:
        return None, None
    amount, unit = m.groups()
    try:
        return float(amount), unit
    except ValueError:
        # The character class also accepts "." and "1.2.3", which float()
        # rejects; report them as unparseable instead of crashing.
        return None, None
# Download the page, closing the HTTP response even if the read fails.
response = urlopen(PAPELES_URL)
try:
    s = response.read()
finally:
    response.close()

tree = html.fromstring(s)
# Every <tr> in the body of the table with id "tabla_datos" becomes one record.
trs = tree.xpath("//table[@id='tabla_datos']/tbody/tr")

pagos = []
for tr in trs:
    # Cells are read by position: month, year, form, concept, then the
    # three money columns (debe, haber, saldo).
    tds = list(tr)
    month = tds[0].text
    try:
        year = int(tds[1].text)
    except (TypeError, ValueError):
        # TypeError: the cell has no text (None); ValueError: the text is
        # not an integer.  The exception types must be a tuple: the old
        # "except TypeError, ValueError" form caught only TypeError.
        year = None
    form = tds[2].text
    concept = tds[3].text_content()
    debe_float, debe_unit = parse_number(tds[4].text_content())
    haber_float, haber_unit = parse_number(tds[5].text_content())
    saldo_float, saldo_unit = parse_number(tds[6].text_content())
    pagos.append({
        "mes": month,
        "anyo": year,
        "forma": form,
        "concepto": concept,
        "debe": debe_float,
        "debe_unidad": debe_unit,
        "haber": haber_float,
        "haber_unidad": haber_unit,
        "saldo": saldo_float,
        "saldo_unidad": saldo_unit,
    })

# The Python 2-only ``encoding`` keyword is dropped: "utf8" was already the
# default there, and Python 3's json.dump rejects the argument.
with open("papeles_barcenas.json", "w") as f:
    json.dump(pagos, f, indent=2)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment