Skip to content

Instantly share code, notes, and snippets.

@g-simmons
Created July 15, 2024 18:51
Show Gist options
  • Save g-simmons/0da1dc7ba87a9c5c6ea3940608dcb3b2 to your computer and use it in GitHub Desktop.
Save g-simmons/0da1dc7ba87a9c5c6ea3940608dcb3b2 to your computer and use it in GitHub Desktop.
"""
This script is designed to remove line numbers from PDF files within a specified margin.
It uses the pymupdf library to interact with PDFs and the Typer library for command-line interface.
The script can be run from the command line, specifying the input PDF file, output PDF file, margin size,
and a regular expression matching the line-number text to remove (the `page_number_regex` option).
Not thoroughly tested.
Gabriel Simmons, 2024
"""
import typer
import re
import fitz
import os
import logging
# Typer application object; CLI commands register themselves on it via @app.command().
app = typer.Typer()

# Configure root logging once at import time: INFO and above, timestamped records.
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(message)s",
    level=logging.INFO,
)

# Module-scoped logger used by the functions in this file.
logger = logging.getLogger(__name__)
def _match_in_rect(match: tuple, rect: tuple) -> bool:
"""
Check if a match is within a given rectangle.
Args:
- match (tuple): A tuple containing the coordinates of the match.
- rect (tuple): A tuple containing the coordinates of the rectangle.
Returns:
- bool: True if the match is within the rectangle, False otherwise.
"""
x0, y0, x1, y1 = match[:4]
return x0 > rect[0] and x1 < rect[2]
def remove_line_numbers(
    input_pdf: str,
    output_pdf: str,
    margin_inches: float = 1.0,
    page_number_regex: str = r"\d{3}",
) -> None:
    """
    Remove line numbers from a PDF within a specified margin and save the result.

    On every page, each extracted word whose full text matches
    `page_number_regex` and whose bounding box lies in the left or right
    margin strip is marked for redaction; the redactions are then applied and
    the document is written to `output_pdf`.

    Args:
    - input_pdf (str): The path to the input PDF file.
    - output_pdf (str): The path to the output PDF file.
    - margin_inches (float, optional): Width of the left and right margin
      strips, in inches. Defaults to 1.0.
    - page_number_regex (str, optional): The regex a word must fully match to
      be removed. Defaults to r"\\d{3}".

    Raises:
    - re.error: If `page_number_regex` is not a valid regular expression.

    NOTE(review): PyMuPDF appears to refuse a plain (non-incremental) save
    onto the file that is currently open, so `output_pdf` should differ from
    `input_pdf` - confirm against the PyMuPDF documentation.
    """
    # PDF coordinates are measured in points, 72 per inch. The previous
    # formula (page_width * margin_inches / 8.5) was only right for pages
    # exactly 8.5 inches wide.
    points_per_inch = 72.0
    margin_pts = margin_inches * points_per_inch

    # Compile once instead of re-resolving the pattern for every word.
    pattern = re.compile(page_number_regex)

    # The context manager closes the document even if a page fails to
    # process or the save raises.
    with fitz.open(input_pdf) as doc:
        for page in doc:
            page_width = page.rect.width
            page_height = page.rect.height
            left_margin_rect = (0, 0, margin_pts, page_height)
            right_margin_rect = (
                page_width - margin_pts,
                0,
                page_width,
                page_height,
            )

            # Each word is a tuple whose first four items are its bounding
            # box and whose fifth item (index 4) is the word text.
            for word in page.get_text("words"):
                if not pattern.fullmatch(word[4]):
                    continue
                if _match_in_rect(word, left_margin_rect) or _match_in_rect(
                    word, right_margin_rect
                ):
                    page.add_redact_annot(word[:4])
                    logger.info("Redacting match: %s", word[4])

            # Apply this page's redaction annotations before moving on.
            page.apply_redactions()

        doc.save(output_pdf)
@app.command()
def process_pdf(
    input_pdf: str = typer.Argument(..., help="Input PDF file path"),
    output_pdf: str = typer.Option(None, help="Output PDF file path"),
    margin_inches: float = typer.Option(1.0, help="Margin size in inches"),
    page_number_regex: str = typer.Option(r"\d{3}", help="Regex to match page numbers"),
) -> None:
    """
    Remove text within the specified margin of a PDF and save the result.
    Args:
    - input_pdf (str): The path to the input PDF file.
    - output_pdf (str, optional): The path to the output PDF file. Defaults to None.
    - margin_inches (float, optional): The margin size in inches. Defaults to 1.0.
    - page_number_regex (str, optional): The regex to match page numbers. Defaults to r"\d{3}".
    """
    # The docstring above is left word-for-word: Typer shows a command
    # function's docstring as that command's --help text.

    if output_pdf:
        destination = output_pdf
    else:
        # No output path supplied: write next to the input, inserting
        # "_processed" between the file stem and its extension.
        stem, suffix = os.path.splitext(input_pdf)
        destination = f"{stem}_processed{suffix}"

    remove_line_numbers(input_pdf, destination, margin_inches, page_number_regex)
    logger.info(f"Processed PDF saved as: {destination}")
# Script entry point: run the Typer CLI only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    app()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment