-
-
Save anilkilic/87f2daefed22620adca8c0b5fd970660 to your computer and use it in GitHub Desktop.
import requests
import json
from pathlib import Path
def get_json():
    """Fetch the full product list from the MAPA pesticide-registry export endpoint.

    Posts an export request with all filter fields blank except
    ``idEstado: "1"`` (active products) and returns the decoded payload
    as a dict.

    Returns:
        dict: the parsed export payload (contains a "Contenido" key whose
        value is itself a JSON-encoded string).

    Raises:
        requests.HTTPError: on a non-2xx response.
        requests.Timeout: if the server does not respond in time.
    """
    data = {
        'tipoExportacion': 'Productos',
        'dataDto': '{"nombreComercial":"","titular":"","numRegistro":"","fabricante":"","idSustancia":null,"idAmbito":null,"idPlaga":null,"idFuncion":null,"idEstado":"1","idCultivo":null,"idSistemaCultivo":null,"idTipoUsuario":null,"ancestros":false,"fecRenoDesde":"","fecRenoHasta":"","fecInscDesde":"","fecInscHasta":"","fecModiDesde":"","fecModiHasta":"","fecCaduDesde":"","fecCaduHasta":"","fecLimiDesde":"","fecLimiHasta":""}',
    }
    response = requests.post(
        'https://servicio.mapa.gob.es/regfiweb/Exportaciones/ExportJson',
        data=data,
        timeout=60,  # avoid hanging forever on a stalled connection
    )
    response.raise_for_status()  # fail loudly instead of parsing an HTML error page
    # The endpoint returns a JSON-encoded *string* that itself contains JSON,
    # hence the extra json.loads on top of response.json().
    data = json.loads(response.json())
    return data
def save_pdf(id):
    """Download the PDF data sheet for one product and save it to files/<id>.pdf.

    Assumes the ``files`` directory already exists (main() creates it).

    Args:
        id: the product identifier (``IdProducto`` from the export payload).
            NOTE: the name shadows the builtin ``id``; kept unchanged for
            backward compatibility with existing callers.

    Raises:
        requests.HTTPError: on a non-2xx response, so an HTML error page
            is never silently written out as a ".pdf".
        requests.Timeout: if the server does not respond in time.
    """
    data = {'idProducto': id}
    response = requests.post(
        'https://servicio.mapa.gob.es/regfiweb/Productos/ExportFichaProductoPdf',
        data=data,
        timeout=60,  # avoid hanging forever on a stalled connection
    )
    response.raise_for_status()
    target = Path("files") / f"{id}.pdf"
    print(f"writing to {id}.pdf")
    target.write_bytes(response.content)  # binary write, no text decoding
def main():
    """Entry point: fetch the product list and download every product PDF.

    Creates the ``files`` output directory if missing, then downloads one
    PDF per product id, sequentially.
    """
    payload = get_json()
    # "Contenido" is itself a JSON-encoded string holding the product rows.
    products = json.loads(payload.get("Contenido"))
    product_ids = [product.get("IdProducto") for product in products]
    print(f"going to download {len(product_ids)} pdfs")
    Path("files").mkdir(parents=True, exist_ok=True)
    for product_id in product_ids:
        save_pdf(product_id)
# Run only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
Hey @Godtooro
I'm not familiar with scrapy. But I looked at it and the code you provided.
To start off:
Spider starts from this url https://servicio.mapa.gob.es/regfiweb#
but this page doesn't contain the product list initially.
https://servicio.mapa.gob.es/regfiweb/Productos/ProductosGrid?NombreComercial=&Titular=&NumRegistro=&Fabricante=&IdSustancia=Elija+una+sustancia&IdEstado=1&IdAmbito=Elija+un+%C3%A1mbito&IdPlaga=Elija+una+plaga&IdFuncion=Elija+una+funci%C3%B3n&IdCultivo=Elija+un+cultivo&IdSistemaCultivo=Elija+un+sistema+de+cultivo&IdTipoUsuario=Elija+un+tipo+usuario&Ancestros=false&FecRenoDesde=&FecRenoHasta=&FecCaduDesde=&FecCaduHasta=&FecModiDesde=&FecModiHasta=&FecInscDesde=&FecInscHasta=&FecLimiDesde=&FecLimiHasta=
this url has the table of products, but the download URLs are generated via JavaScript, calling exportFichaProductoPdf
on the data-id attribute of each row. So there is no direct file_url to grab.
So unfortunately I won't be able to help.
Good luck.
Hi @anilkilic !
Thank you so much! With your help I have a starting point.
I'm going to keep investigating. If I find the solution, I'll add it here!
Hi @anilkilic !
To be honest this is awesome!
I'm a total beginner — could you help me integrate it into the spider, please?
Do I need to change settings.py?
Now is as follows:
And items.py is:
And to crawl is: scrapy crawl requests ?