Created
July 15, 2024 18:51
-
-
Save g-simmons/0da1dc7ba87a9c5c6ea3940608dcb3b2 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
This script removes line numbers that sit within a specified margin of each page of a PDF file.
It uses the pymupdf library (imported as fitz) to read and redact PDFs and the Typer library for the command-line interface.
The script can be run from the command line, specifying the input PDF file, output PDF file, margin size,
and a regular expression that a margin word must fully match in order to be removed.
Not thoroughly tested. | |
Gabriel Simmons, 2024 | |
""" | |
import typer | |
import re | |
import fitz | |
import os | |
import logging | |
# Configure logging at import time: records at INFO and above are emitted
# with a timestamp, the level name and the message.
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(message)s",
    level=logging.INFO,
)
logger = logging.getLogger(__name__)

# Typer application object; CLI commands register on it via @app.command().
app = typer.Typer()
def _match_in_rect(match: tuple, rect: tuple) -> bool: | |
""" | |
Check if a match is within a given rectangle. | |
Args: | |
- match (tuple): A tuple containing the coordinates of the match. | |
- rect (tuple): A tuple containing the coordinates of the rectangle. | |
Returns: | |
- bool: True if the match is within the rectangle, False otherwise. | |
""" | |
x0, y0, x1, y1 = match[:4] | |
return x0 > rect[0] and x1 < rect[2] | |
def remove_line_numbers(
    input_pdf: str,
    output_pdf: str,
    margin_inches: float = 1.0,
    page_number_regex: str = r"\d{3}",
) -> None:
    r"""
    Remove line numbers from a PDF within a specified margin and save the result.

    For every page, each extracted word whose text fully matches
    ``page_number_regex`` and whose bounding box lies inside the left or the
    right margin strip is marked with a redaction annotation. The page's
    redactions are then applied, and the document is written to ``output_pdf``.

    Args:
    - input_pdf (str): The path to the input PDF file.
    - output_pdf (str): The path to the output PDF file.
    - margin_inches (float, optional): The width of the left and right margin
      strips in inches. Defaults to 1.0.
    - page_number_regex (str, optional): The regex a word must fully match to
      be removed. Defaults to r"\d{3}".
    """
    # Page coordinates are expressed in points (72 per inch), so the margin
    # width must not be derived from the page width: doing so is only correct
    # for pages that happen to be 8.5 inches wide.
    points_per_inch = 72.0
    margin_pts = margin_inches * points_per_inch

    # Compile once instead of looking the pattern up again for every word.
    number_pattern = re.compile(page_number_regex)

    doc = fitz.open(input_pdf)
    try:
        for page in doc:
            page_width = page.rect.width
            page_height = page.rect.height
            left_margin_rect = (0, 0, margin_pts, page_height)
            right_margin_rect = (page_width - margin_pts, 0, page_width, page_height)

            # Each word is a tuple whose first four items are its bounding box
            # and whose fifth item (index 4) is the word text.
            for word in page.get_text("words"):
                if not number_pattern.fullmatch(word[4]):
                    continue
                if _match_in_rect(word, left_margin_rect) or _match_in_rect(
                    word, right_margin_rect
                ):
                    page.add_redact_annot(word[:4])
                    logger.info("Redacting match: %s", word[4])

            # Apply once per page, after all annotations have been added.
            page.apply_redactions()
        doc.save(output_pdf)
    finally:
        # Release the document even when processing or saving fails.
        doc.close()
@app.command()
def process_pdf(
    input_pdf: str = typer.Argument(..., help="Input PDF file path"),
    output_pdf: str = typer.Option(None, help="Output PDF file path"),
    margin_inches: float = typer.Option(1.0, help="Margin size in inches"),
    page_number_regex: str = typer.Option(r"\d{3}", help="Regex to match page numbers"),
) -> None:
    r"""
    Remove regex-matching numbers within the specified margin of a PDF and save the result.

    Args:
    - input_pdf (str): The path to the input PDF file.
    - output_pdf (str, optional): The path to the output PDF file. Defaults to None,
      in which case "_processed" is inserted before the input file's extension.
    - margin_inches (float, optional): The margin size in inches. Defaults to 1.0.
    - page_number_regex (str, optional): The regex a margin word must match to be
      removed. Defaults to r"\d{3}".
    """
    # The docstring is a raw string because it contains "\d", which is an
    # invalid escape sequence in a regular string literal.
    if not output_pdf:
        # Derive "<name>_processed<ext>" next to the input file.
        base, ext = os.path.splitext(input_pdf)
        output_pdf = f"{base}_processed{ext}"
    remove_line_numbers(input_pdf, output_pdf, margin_inches, page_number_regex)
    logger.info("Processed PDF saved as: %s", output_pdf)
# Start the command-line application only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
    app()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment