Created
March 10, 2016 13:45
-
-
Save anonymous/7a5b23a796631bf881ce to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import requests | |
from lxml import etree | |
# Scheme + host of the site; prepended to site-relative hrefs.
root = 'http://www.agenciatributaria.es'

# Site-relative path of the starting page whose links are crawled below.
# Split across lines for readability only; adjacent literals are concatenated
# into exactly the same single string.
homepage = (
    '/AEAT.internet/Inicio/La_Agencia_Tributaria'
    '/Memorias_y_estadisticas_tributarias/Estadisticas'
    '/Comercio_exterior/Datos_estadisticos'
    '/Descarga_de_Datos_Estadisticos'
    '/Descarga_de_datos_mensuales_maxima_desagregacion_en_Euros__centimos_'
    '/Descarga_de_datos_mensuales_maxima_desagregacion_en_Euros__centimos_.shtml'
    '?mobileView=false'
)
def get_contenido_links(root, url):
    """Fetch a page and return the anchors inside its ``contenido`` div.

    Args:
        root: Prefix (scheme + host) prepended to ``url`` when ``url`` does
            not already start with ``'http'``.
        url: Absolute URL, or a path to be appended to ``root``.

    Returns:
        A list of lxml ``<a>`` elements located anywhere beneath a
        ``<div class="contenido">`` of the fetched document; an empty list
        when the parser produces no document.
    """
    if not url.startswith('http'):
        url = root + url
    # The second positional argument of requests.get() is ``params``; the
    # previous ``requests.get(url, 'utf-8')`` therefore sent 'utf-8' as a
    # query string instead of selecting an encoding.  Decoding of ``.text``
    # is left to requests' own charset detection, as it effectively was.
    html = requests.get(url).text
    parsed = etree.HTML(html)
    if parsed is None:
        # Defensive: nothing parseable came back, so there are no links.
        return []
    return parsed.xpath("//div[@class='contenido']//a")
def get_years(root, url):
    """Return the hrefs of ``contenido`` links whose text is 4 characters long.

    Args:
        root: Prefix forwarded to ``get_contenido_links``.
        url: Page to scan, forwarded to ``get_contenido_links``.

    Returns:
        A list with the ``href`` attribute of every matching link, in
        document order.  Presumably the 4-character labels are years
        (e.g. ``'2015'``); the code only checks the length.
    """
    return [
        link.attrib['href']
        for link in get_contenido_links(root, url)
        # ``text`` is None for anchors without leading text (e.g. an anchor
        # that only wraps an image); skip those instead of failing in len().
        if link.text is not None and len(link.text) == 4
    ]
# Crawl three levels deep: each qualifying link on the home page, then every
# link on that page, then every link on those pages; print the final hrefs
# prefixed with ``root``.  (Going by the loop names these levels are years,
# months and downloadable archives.)
for year in get_years(root, homepage):
    for month in get_contenido_links(root, year):
        for archive in get_contenido_links(root, month.attrib['href']):
            # Call form of print with a single argument behaves the same on
            # Python 2 and Python 3.
            print(root + archive.attrib['href'])
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment