Skip to content

Instantly share code, notes, and snippets.

@136s
Created September 8, 2023 07:33
Show Gist options
  • Save 136s/4a8baef9457fac067b9151b0b3a1d820 to your computer and use it in GitHub Desktop.
Save 136s/4a8baef9457fac067b9151b0b3a1d820 to your computer and use it in GitHub Desktop.
Remove 'http://localhost' from all links in a PDF file.
#!/usr/bin/env python
import re
import sys
import fitz
def remove_localhost_from_pdf(
    pdf_filename: str, pattern: re.Pattern = re.compile(r"^http://localhost:\d+/")
) -> int:
    """
    Remove 'http://localhost' from all links in a PDF file.

    Every link whose "uri" entry matches ``pattern`` at its start has all
    matches of ``pattern`` replaced with the empty string, and the link is
    updated on its page. The file is saved incrementally, and only when at
    least one link was actually rewritten.

    Args:
        pdf_filename (str): The filename of the PDF file to process.
        pattern (re.Pattern, optional): A regular expression pattern used to identify links to be removed.
            Defaults to a pattern that matches 'http://localhost:port/'.

    Returns:
        int: The number of links that were rewritten.
    """
    rewritten = 0
    with fitz.open(pdf_filename) as doc:
        for page in doc:
            for link in page.get_links():
                uri = link.get("uri")
                # Links without a URI (or with an empty one) are left alone.
                if not uri or not pattern.match(uri):
                    continue
                # NOTE(review): a URI that consists solely of the matched
                # prefix becomes "" here -- confirm update_link accepts that.
                link["uri"] = pattern.sub("", uri)
                page.update_link(link)
                rewritten += 1
        # Save while the document is still open, and skip the write entirely
        # when nothing changed so untouched PDFs are not appended to.
        if rewritten:
            doc.saveIncr()
    return rewritten
if __name__ == "__main__":
    # Every command-line argument is treated as a PDF path to process.
    cli_paths = sys.argv[1:]
    for path in cli_paths:
        remove_localhost_from_pdf(path)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment