Skip to content

Instantly share code, notes, and snippets.

@agentcooper
Created November 17, 2016 22:50
Show Gist options
  • Save agentcooper/4c55133f5d95866acdee5017cd318558 to your computer and use it in GitHub Desktop.
Save agentcooper/4c55133f5d95866acdee5017cd318558 to your computer and use it in GitHub Desktop.
from PyPDF2 import PdfFileWriter, PdfFileReader
from PyPDF2Highlight import createHighlight, addHighlightToPage
# Example usage: add one highlight annotation to the first page of
# input.pdf and save the result as output.pdf.
with open("input.pdf", "rb") as inputStream:
    pdfInput = PdfFileReader(inputStream)
    pdfOutput = PdfFileWriter()
    page1 = pdfInput.getPage(0)
    highlight = createHighlight(100, 400, 400, 500, {
        "author": "",
        "contents": "Bla-bla-bla"
    })
    addHighlightToPage(highlight, page1, pdfOutput)
    pdfOutput.addPage(page1)
    # The output is written while the input file is still open: the page
    # object comes from the reader, which may still need to read from the
    # input stream during the write (NOTE(review): confirm against PyPDF2).
    with open("output.pdf", "wb") as outputStream:
        pdfOutput.write(outputStream)
from PyPDF2.generic import (
DictionaryObject,
NumberObject,
FloatObject,
NameObject,
TextStringObject,
ArrayObject
)
# x1, y1 starts in bottom left corner
def createHighlight(x1, y1, x2, y2, meta, color=(1, 0, 0)):
    """Build a PDF highlight annotation dictionary for one rectangle.

    Args:
        x1, y1: One corner of the rectangle (the bottom-left corner when
            used as in the accompanying example; PDF coordinates start
            at the bottom-left of the page).
        x2, y2: The opposite corner of the rectangle.
        meta: Mapping with the keys "author" and "contents"; they are
            stored in the annotation's /T and /Contents entries.
        color: Iterable of colour components written to /C. The default
            is an immutable tuple so it cannot be modified between calls.

    Returns:
        A DictionaryObject describing the annotation. It is not attached
        to any page yet.

    Raises:
        KeyError: If ``meta`` lacks "author" or "contents".
    """
    newHighlight = DictionaryObject()
    newHighlight.update({
        # Annotation flags value 4 (bit 3, "Print" in the PDF specification).
        NameObject("/F"): NumberObject(4),
        NameObject("/Type"): NameObject("/Annot"),
        NameObject("/Subtype"): NameObject("/Highlight"),
        NameObject("/T"): TextStringObject(meta["author"]),
        NameObject("/Contents"): TextStringObject(meta["contents"]),
        NameObject("/C"): ArrayObject([FloatObject(c) for c in color]),
        NameObject("/Rect"): ArrayObject(
            [FloatObject(v) for v in (x1, y1, x2, y2)]
        ),
        # Corner order: (x1, y2), (x2, y2), (x1, y1), (x2, y1).
        NameObject("/QuadPoints"): ArrayObject(
            [FloatObject(v) for v in (x1, y2, x2, y2, x1, y1, x2, y1)]
        ),
    })
    return newHighlight
def addHighlightToPage(highlight, page, output):
    """Register ``highlight`` with the writer and attach it to ``page``.

    The annotation is first handed to ``output._addObject`` and the value
    that call returns (presumably an indirect reference - confirm against
    PyPDF2) is what gets stored in the page's /Annots array. The array is
    created when the page does not have one yet.
    """
    annots_key = NameObject("/Annots")
    highlight_ref = output._addObject(highlight)
    if "/Annots" not in page:
        page[annots_key] = ArrayObject([highlight_ref])
        return
    page[annots_key].append(highlight_ref)
@wave-DmP
Copy link

Is there a way to search for TextStringObjects and use the results to highlight them?

@ayoyu
Copy link

ayoyu commented Dec 4, 2018

Thanks for sharing 👍
I just want to know if there is a way to extract QuadPoints from certain regions in our pdf?

@BKaurHarpreet
Copy link

Could you please explain the code? Also, why did you import objects such as DictionaryObject, etc.?

@MartinThoma
Copy link

You should use writer.add_annotation(page_number=0, annotation=annotation) instead of addHighlightToPage(highlight, page1, pdfOutput)

I will likely soon add official support for highlight annotations to pypdf. See https://pypdf.readthedocs.io/en/latest/user/adding-pdf-annotations.html

@AdeelK93
Copy link

This is very helpful! I took it a bit further and used pdfminer to find the text that you're trying to highlight, optionally constraining that search space to a bounding box. PDF makes this incredibly complicated - you have to find the coordinates of every letter on the page, cluster them into bounding boxes line-by-line, and then highlight that polygon. Hope this code helps someone the way @agentcooper's helped me.

You can then take the result of highlight_annotation and pass that on to writer.add_annotation as @MartinThoma suggested

from pdfminer.high_level import extract_pages
from pdfminer.layout import LTChar, LTPage, LTTextContainer, LTTextLine
from pypdf import PdfReader, PdfWriter
from pypdf.generic import (ArrayObject, DictionaryObject, FloatObject, NameObject, NumberObject,
                           TextStringObject)


def is_within_bbox(bbox: list[float], constraint_bbox: list[float], margin = 10):
    """Tell whether ``bbox`` fits inside ``constraint_bbox`` within a tolerance.

    Both boxes are ``[x0, y0, x1, y1]``. Each edge of ``bbox`` may stick
    out of ``constraint_bbox`` by up to ``margin`` units and still count
    as inside.
    """
    left, bottom, right, top = bbox
    c_left, c_bottom, c_right, c_top = constraint_bbox

    # Lower edges are pushed inward by the margin before comparing ...
    lower_edges_ok = left + margin >= c_left and bottom + margin >= c_bottom
    # ... and upper edges are pulled inward by the same amount.
    upper_edges_ok = right - margin <= c_right and top - margin <= c_top
    return lower_edges_ok and upper_edges_ok

def extract_char_bboxes(page_layout: LTPage, constraint_bbox: list[float] | None = None):
    """Collect a page's text together with one bounding box per character.

    Walks text containers -> text lines -> line items and keeps only the
    ``LTChar`` items; every other item in a line is skipped and therefore
    contributes nothing to the returned text.

    Args:
        page_layout: A pdfminer page layout to scan.
        constraint_bbox: Optional ``[x0, y0, x1, y1]`` region; when given,
            only characters accepted by ``is_within_bbox`` are kept.

    Returns:
        ``(text, char_bboxes)`` where ``char_bboxes[i]`` is the
        ``[x0, y0, x1, y1]`` box of the glyph that produced ``text[i]``,
        so both sequences always have the same length.
    """
    pieces: list[str] = []
    char_bboxes: list[list[float]] = []
    for element in page_layout:
        if not isinstance(element, LTTextContainer):
            continue
        for text_line in element:
            if not isinstance(text_line, LTTextLine):
                continue
            for char_obj in text_line:
                if not isinstance(char_obj, LTChar):
                    continue
                char_bbox = list(char_obj.bbox)
                if constraint_bbox is not None and not is_within_bbox(char_bbox, constraint_bbox):
                    continue
                glyph_text = char_obj.get_text()
                pieces.append(glyph_text)
                # get_text() may yield more (or fewer) than one character
                # for a single glyph; emit one bbox per character so that
                # string indices into the text stay valid indices into
                # char_bboxes.
                char_bboxes.extend(list(char_bbox) for _ in glyph_text)
    return "".join(pieces), char_bboxes


def find_text_bbox(pdf, target_text: str, constraint_bbox: list[float] | None = None) -> list[list[float]]:
    """Locate the first occurrence of ``target_text`` and box it line by line.

    Pages are scanned in order and the search stops at the first page
    whose extracted text contains ``target_text``; only the first
    occurrence on that page is used.

    Args:
        pdf: Whatever ``pdfminer.high_level.extract_pages`` accepts.
        target_text: The exact string to look for in the extracted text.
        constraint_bbox: Optional ``[x0, y0, x1, y1]`` region that limits
            which characters are considered.

    Returns:
        One ``[x0, y0, x1, y1]`` box per text line covered by the match,
        in the order the lines first appear in the match.

    Raises:
        ValueError: If no page contains ``target_text``.
    """
    for page_layout in extract_pages(pdf):
        text, char_bboxes = extract_char_bboxes(page_layout, constraint_bbox)

        start_index = text.find(target_text)
        if start_index == -1:
            continue
        end_index = start_index + len(target_text)

        # Characters that share the same y0 are treated as one line. A dict
        # keeps the lines in first-seen order, so the result is ordered the
        # way the matched text runs rather than by set iteration order.
        chars_by_line: dict[float, list[list[float]]] = {}
        for char_bbox in char_bboxes[start_index:end_index]:
            chars_by_line.setdefault(char_bbox[1], []).append(char_bbox)

        line_bboxes = []
        for line_chars in chars_by_line.values():
            x0, y0, x1, y1 = zip(*line_chars)
            line_bboxes.append([min(x0), min(y0), max(x1), max(y1)])
        return line_bboxes

    raise ValueError(f"Text '{target_text}' not found in the PDF.")

# x1, y1 starts in bottom left corner
def highlight_annotation(bounds: list[list[float]], author: str, contents: str, color=(1, 0, 0)):
    """Build one highlight annotation that covers several line boxes.

    Args:
        bounds: Non-empty list of ``[x0, y0, x1, y1]`` boxes, one per
            highlighted line, with (x0, y0) the bottom-left corner.
        author: Text stored in the annotation's /T entry.
        contents: Text stored in the annotation's /Contents entry.
        color: Iterable of colour components written to /C. The default
            is an immutable tuple so it cannot be modified between calls.

    Returns:
        A DictionaryObject describing the annotation. It is not attached
        to any page yet.

    Raises:
        ValueError: If ``bounds`` is empty.
    """
    if not bounds:
        raise ValueError("bounds must contain at least one [x0, y0, x1, y1] box")

    # The rectangle that bounds the whole highlight
    lefts, bottoms, rights, tops = zip(*bounds)
    rect_bbox = [min(lefts), min(bottoms), max(rights), max(tops)]

    # Quad points: four corners per line, ordered top-left, top-right,
    # bottom-left, bottom-right.
    quad_points = []
    for left, bottom, right, top in bounds:
        quad_points.extend([left, top, right, top, left, bottom, right, bottom])

    newHighlight = DictionaryObject({
        # Annotation flags value 4 is bit 3, "Print", in the PDF
        # specification (NoZoom would be bit 4, value 8).
        NameObject("/F"): NumberObject(4),
        NameObject("/Type"): NameObject("/Annot"),
        NameObject("/Subtype"): NameObject("/Highlight"),

        NameObject("/T"): TextStringObject(author),
        NameObject("/Contents"): TextStringObject(contents),

        NameObject("/C"): ArrayObject([FloatObject(c) for c in color]),
        NameObject("/Rect"): ArrayObject([FloatObject(c) for c in rect_bbox]),
        NameObject("/QuadPoints"): ArrayObject([FloatObject(c) for c in quad_points]),
    })

    return newHighlight

@rbehal
Copy link

rbehal commented Feb 18, 2024

Updated for newer versions of PyPDF2 that no longer support PdfFileReader and PdfFileWriter:

from PyPDF2 import PdfReader, PdfWriter
from PyPDF2.generic import (
    DictionaryObject,
    NumberObject,
    FloatObject,
    NameObject,
    TextStringObject,
    ArrayObject
)

def createHighlight(x1, y1, x2, y2, meta, color=(1, 0, 0)):
    """Build a PDF highlight annotation dictionary for one rectangle.

    Args:
        x1, y1: One corner of the rectangle (NOTE(review): presumably the
            bottom-left corner in PDF page coordinates - confirm with
            callers).
        x2, y2: The opposite corner of the rectangle.
        meta: Mapping with the keys "author" and "contents"; they are
            stored in the annotation's /T and /Contents entries.
        color: Iterable of colour components written to /C. The default
            is an immutable tuple so it cannot be modified between calls.

    Returns:
        A DictionaryObject describing the annotation. It is not attached
        to any page yet.

    Raises:
        KeyError: If ``meta`` lacks "author" or "contents".
    """
    rect = (x1, y1, x2, y2)
    # Corner order: (x1, y2), (x2, y2), (x1, y1), (x2, y1).
    quad_points = (x1, y2, x2, y2, x1, y1, x2, y1)

    newHighlight = DictionaryObject()

    newHighlight.update({
        # Annotation flags value 4 (bit 3, "Print" in the PDF specification).
        NameObject("/F"): NumberObject(4),
        NameObject("/Type"): NameObject("/Annot"),
        NameObject("/Subtype"): NameObject("/Highlight"),

        NameObject("/T"): TextStringObject(meta["author"]),
        NameObject("/Contents"): TextStringObject(meta["contents"]),

        NameObject("/C"): ArrayObject([FloatObject(c) for c in color]),
        NameObject("/Rect"): ArrayObject([FloatObject(v) for v in rect]),
        NameObject("/QuadPoints"): ArrayObject([FloatObject(v) for v in quad_points]),
    })

    return newHighlight

def addHighlightToPage(highlight, page, writer):
    """Attach ``highlight`` to ``page`` by storing it in the /Annots array.

    The annotation dictionary itself is placed in the array; the array is
    created when the page does not have one yet. ``writer`` is accepted
    but not used by this function.
    """
    if "/Annots" not in page:
        # First annotation on this page: start a new array.
        page[NameObject("/Annots")] = ArrayObject([highlight])
        return
    page["/Annots"].append(highlight)

@MartinThoma
Copy link

MartinThoma commented Feb 22, 2024

@rbehal You should no longer use PyPDF2 as it's deprecated. Use pypdf. It supports highlights out of the box: https://pypdf.readthedocs.io/en/latest/user/adding-pdf-annotations.html#highlighting

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment