Skip to content

Instantly share code, notes, and snippets.

@ecarreras
Created December 13, 2023 09:35
Show Gist options
  • Save ecarreras/25146a627024d1015d85f46a05d75833 to your computer and use it in GitHub Desktop.
Save ecarreras/25146a627024d1015d85f46a05d75833 to your computer and use it in GitHub Desktop.
import requests
from bs4 import BeautifulSoup
import os
from PyPDF2 import PdfReader, PdfWriter
from rich.progress import track
def merge_pdfs(folder_path, output_filename):
    """Merge every PDF found directly inside a folder into one PDF file.

    Files are appended in sorted filename order, and within each file the
    pages keep their original order.

    Args:
        folder_path: Directory that is listed (non-recursively) for files
            whose name ends with ``.pdf``.
        output_filename: Path of the merged PDF to write; an existing file
            at that path is overwritten.

    Raises:
        OSError: If ``folder_path`` cannot be listed or ``output_filename``
            cannot be opened for writing.
    """
    pdf_writer = PdfWriter()
    # os.listdir() yields entries in arbitrary order; sort them so the page
    # order of the merged document is reproducible from run to run.
    pdfs = sorted(
        item for item in os.listdir(folder_path) if item.endswith('.pdf')
    )
    for item in track(pdfs, description="Merging..."):
        pdf_reader = PdfReader(os.path.join(folder_path, item))
        for page in pdf_reader.pages:
            pdf_writer.add_page(page)

    # Write once, after all pages of all inputs have been collected.
    with open(output_filename, 'wb') as out:
        pdf_writer.write(out)
def download_pdfs(url, download_folder, timeout=30):
    """Download every PDF linked from a web page into a local folder.

    The page at ``url`` is fetched and parsed, every ``<a>`` element whose
    ``href`` ends with ``.pdf`` is collected, and each target is streamed to
    ``download_folder`` under the last path segment of its ``href``.

    Args:
        url: Address of the HTML page to scan; also the base used to resolve
            relative links.
        download_folder: Directory the PDFs are written to. It is created
            if it does not exist yet.
        timeout: Seconds passed to ``requests.get`` as the timeout of every
            HTTP request, so a stalled server cannot block forever.

    Raises:
        requests.HTTPError: If the page or any PDF responds with an HTTP
            error status.
        requests.RequestException: On connection problems or timeouts.
        OSError: If the folder cannot be created or a file cannot be written.
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly instead of parsing an error page and finding zero links.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')
    links = soup.find_all('a')
    pdf_links = [
        link for link in links
        if link.get('href') and link['href'].endswith('.pdf')
    ]

    # open() below does not create missing directories.
    os.makedirs(download_folder, exist_ok=True)

    for link in track(pdf_links, description="Downloading..."):
        href = link['href']
        # hrefs may be relative; resolve them against the page URL.
        full_url = requests.compat.urljoin(url, href)
        filename = os.path.join(download_folder, href.split('/')[-1])
        # Stream in chunks so large PDFs are never held fully in memory.
        with requests.get(full_url, stream=True, timeout=timeout) as r:
            r.raise_for_status()
            with open(filename, 'wb') as f:
                for chunk in r.iter_content(chunk_size=8192):
                    f.write(chunk)
# URL of the web page that is scanned for PDF links
url = "https://www.ree.es/es/actividades/operacion-del-sistema-electrico/procedimientos-de-operacion"
# Folder where the PDFs will be saved
download_folder = "/tmp/pos"
# Download step, currently disabled: as written, only the PDFs already
# present in download_folder are merged. Uncomment to fetch them first.
#download_pdfs(url, download_folder)
# Name of the output file
output_filename = '/tmp/po_unified.pdf'
merge_pdfs(download_folder, output_filename)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment