Skip to content

Instantly share code, notes, and snippets.

@ettorerizza
Last active April 15, 2024 16:50
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save ettorerizza/e41bb75e9aeeb20cb7dba7f879b772f9 to your computer and use it in GitHub Desktop.
Save ettorerizza/e41bb75e9aeeb20cb7dba7f879b772f9 to your computer and use it in GitHub Desktop.
List of URLs to PDF with headless Chrome (macOS)
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import os
import requests
from bs4 import BeautifulSoup
import glob
from PyPDF2 import PdfFileMerger
def merger(output, input_path):
    """
    Merge PDF files into a single PDF written to ``output``.

    output: path of the merged PDF to create.
    input_path: directory searched for ``*.pdf`` files. If it is not an
        existing directory, the current working directory is searched
        instead (the historical behaviour of this function).

    Input files are appended oldest-first (by modification time). The
    ``output`` file itself is never used as an input, and nothing is
    written when no input PDF is found.
    """
    if input_path and os.path.isdir(input_path):
        search_dir = input_path
    else:
        search_dir = os.curdir

    # A merged file left over from a previous run must not be merged into
    # itself (it would also be truncated while still being read).
    output_abs = os.path.abspath(output)
    candidates = glob.glob(os.path.join(search_dir, '*.pdf'))
    files = sorted(
        (f for f in candidates
         if os.path.isfile(f) and os.path.abspath(f) != output_abs),
        key=os.path.getmtime,
    )
    print("files", files)
    if not files:
        return

    # PyPDF2 >= 3.0 refuses to instantiate PdfFileMerger; prefer the
    # renamed class when the installed version provides it.
    try:
        from PyPDF2 import PdfMerger as merger_class
    except ImportError:
        merger_class = PdfFileMerger

    pdf_merger = merger_class()
    try:
        for file in files:
            pdf_merger.append(file)
        with open(output, 'wb') as fileobj:
            pdf_merger.write(fileobj)
    finally:
        # Release the file handles the merger keeps open on its inputs.
        pdf_merger.close()
def url_to_pdf(url,
               folder,
               filename,
               CHROME_PATH=r"/Applications/Google\ Chrome.app/Contents/MacOS/Google\ Chrome"):
    """
    Use headless chrome CLI tool to print url to pdf

    url: address of the page to print.
    folder: directory that receives the PDF (created if missing).
    filename: name of the PDF file, without the ``.pdf`` extension.
    CHROME_PATH: path to the Chrome executable. Either a plain path to an
        existing file or a shell-escaped command line (as in the default).

    Returns the exit code of the Chrome process. Raises OSError (e.g.
    FileNotFoundError) when the executable cannot be started.
    """
    import shlex
    import subprocess

    # TODO: add a default path also for Windows and Linux
    if os.path.isfile(CHROME_PATH):
        chrome_cmd = [CHROME_PATH]
    else:
        # Undo shell escaping such as "Google\ Chrome" so the path can be
        # passed to the OS without going through a shell.
        chrome_cmd = shlex.split(CHROME_PATH, posix=(os.name != "nt"))

    target = os.path.abspath(os.path.join(folder, f"{filename}.pdf"))
    os.makedirs(os.path.dirname(target), exist_ok=True)

    chrome_args = chrome_cmd + [
        '--headless',
        '--disable-gpu',
        f'--print-to-pdf={target}',
        url,
    ]
    # Argument list + no shell: characters like "&", ";" or "?" in the URL
    # are passed to Chrome verbatim instead of being interpreted by a shell.
    return subprocess.run(chrome_args).returncode
def _pdf_name_from_url(url):
    """Return a file stem for ``url``: its last path segment without ``.html``."""
    from urllib.parse import urlsplit

    parts = urlsplit(url)
    name = parts.path.rstrip("/").split("/")[-1]
    # Remove the suffix explicitly: str.strip(".html") strips a *set of
    # characters* and would turn "data-import.html" into "data-impor".
    if name.endswith(".html"):
        name = name[:-len(".html")]
    return name or parts.netloc or "index"


def links_to_pdf(project_name, base_url, css_selector, unique=False):
    """
    Print every page linked from ``base_url`` to a PDF in ``project_name``.

    project_name: output directory (created if missing). The process
        changes its working directory into it.
    base_url: page that is downloaded and scanned for links.
    css_selector: to find links in main page
    unique: Do you want a single PDF ? If true, the PDFs found in the
        output directory are merged into ``<project_name>.pdf`` there.

    Raises requests.RequestException when the main page cannot be fetched.
    """
    from urllib.parse import urljoin

    res = requests.get(base_url, timeout=30)
    res.raise_for_status()
    soup = BeautifulSoup(res.text, 'lxml')
    hrefs = (link.get('href') for link in soup.select(css_selector))
    # urljoin copes with absolute hrefs and a base_url without trailing "/".
    urls = [urljoin(base_url, href) for href in hrefs if href]

    # Create target Directory if don't exist
    if os.path.isdir(project_name):
        print("Directory ", project_name, " already exists")
    else:
        os.makedirs(project_name)
        print("Directory ", project_name, " Created ")
    os.chdir(project_name)

    visited = set()
    for url in urls:
        url_clean = url.split("#")[0]
        if url_clean in visited:
            continue
        visited.add(url_clean)
        print(url_clean)
        name = _pdf_name_from_url(url_clean)
        print(name)
        try:
            # We are already inside the project directory, so print into
            # the current directory rather than project_name/project_name.
            url_to_pdf(url_clean, os.curdir, name)
        except Exception as e:
            # Best effort: one failing page must not abort the whole batch.
            print(e)

    if unique:
        output = os.path.basename(os.path.normpath(project_name)) + ".pdf"
        merger(output, os.curdir)
if __name__ == '__main__':
    # Example run: follow the links matched by the selector on the
    # r4ds.had.co.nz front page and print each one into the "r-data" folder.
    links_to_pdf(
        project_name="r-data",
        base_url="https://r4ds.had.co.nz/",
        css_selector=".part+ .chapter a",
    )
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment