Created
May 9, 2024 05:07
-
-
Save vjvelascorios/68fe2be381ac14a1bdcfcd116135a3e5 to your computer and use it in GitHub Desktop.
Download Banxico's Library monthly updates
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os | |
import requests | |
from bs4 import BeautifulSoup | |
from urllib.parse import urljoin | |
from datetime import datetime | |
# Download the PDF bulletins linked from Banxico's library bulletin page into
# a local folder, naming each file after the date shown in its table row and
# skipping bulletins that were already downloaded.

# Page that lists the bulletins.
url = "https://www.banxico.org.mx/servicios/boletin-mensual-de-la-biblioteca-del-banco-de-mexi/boletin-mensual-biblioteca-pu.html"

# Local folder where the PDF files are saved.
local_folder = "C:\\banxicos library updates"

# Seconds to wait for the server before giving up; without a timeout
# requests.get() can block forever on a stalled connection.
request_timeout = 30

# Create the local folder if it does not exist yet.
os.makedirs(local_folder, exist_ok=True)

# Fetch the index page. A connection-level failure is reported through the
# same "could not access" branch as a non-200 status instead of a traceback.
try:
    response = requests.get(url, timeout=request_timeout)
except requests.RequestException:
    response = None

if response is not None and response.status_code == 200:
    # Parse the HTML content.
    soup = BeautifulSoup(response.content, 'html.parser')

    # Find every <a> element whose href points to a PDF file.
    pdf_links = soup.find_all('a', href=lambda href: href and href.endswith('.pdf'))

    # Download the PDF files that are not present locally.
    for link in pdf_links:
        pdf_url = urljoin(url, link['href'])

        # The date used for the file name comes from the table row that
        # contains the link. Links outside such a row are skipped rather
        # than crashing the whole run.
        row = link.find_parent('tr')
        date_cell = row.find('td', class_='bmdateview') if row is not None else None
        if date_cell is None:
            print(f"No se encontró la fecha para: {pdf_url}")
            continue

        date_string = date_cell.text.strip()
        try:
            date = datetime.strptime(date_string, '%d/%m/%y')
        except ValueError:
            print(f"Fecha no reconocida ({date_string!r}) para: {pdf_url}")
            continue

        new_file_name = f"Boletín Banxico {date.strftime('%Y.%m.%d')}.pdf"
        local_path = os.path.join(local_folder, new_file_name)

        # Nothing to do when the bulletin was already downloaded.
        if os.path.exists(local_path):
            continue

        # Download BEFORE touching the destination, and reject error
        # responses, so a failed request never leaves an empty or bogus file
        # behind that later runs would mistake for a finished download.
        try:
            pdf_response = requests.get(pdf_url, timeout=request_timeout)
            pdf_response.raise_for_status()
        except requests.RequestException as exc:
            print(f"No se pudo descargar {pdf_url}: {exc}")
            continue

        # Write to a temporary name and rename once complete, so an
        # interrupted write cannot be confused with a complete PDF.
        partial_path = local_path + ".part"
        with open(partial_path, 'wb') as f:
            f.write(pdf_response.content)
        os.replace(partial_path, local_path)

        print(f"Se ha descargado un nuevo archivo: {local_path}")
else:
    print("No se pudo acceder a la página web")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment