-
-
Save agentcooper/4c55133f5d95866acdee5017cd318558 to your computer and use it in GitHub Desktop.
from PyPDF2 import PdfFileWriter, PdfFileReader | |
from PyPDF2Highlight import createHighlight, addHighlightToPage | |
# Usage example: add a single highlight to the first page of input.pdf and
# write the result to output.pdf.
#
# Both files are opened with context managers so the handles are closed even
# if something fails. The input stays open until the output has been written,
# because the reader may still need to read page data from it at write time.
with open("input.pdf", "rb") as inputStream:
    pdfInput = PdfFileReader(inputStream)
    pdfOutput = PdfFileWriter()

    page1 = pdfInput.getPage(0)

    # Rectangle corners are given as (x1, y1, x2, y2).
    highlight = createHighlight(100, 400, 400, 500, {
        "author": "",
        "contents": "Bla-bla-bla"
    })

    addHighlightToPage(highlight, page1, pdfOutput)
    pdfOutput.addPage(page1)

    with open("output.pdf", "wb") as outputStream:
        pdfOutput.write(outputStream)
from PyPDF2.generic import ( | |
DictionaryObject, | |
NumberObject, | |
FloatObject, | |
NameObject, | |
TextStringObject, | |
ArrayObject | |
) | |
# x1, y1 starts in bottom left corner
def createHighlight(x1, y1, x2, y2, meta, color=(1, 0, 0)):
    """Build a PDF highlight annotation dictionary for one rectangle.

    Args:
        x1, y1: First corner of the rectangle (bottom left, per the comment
            above).
        x2, y2: Opposite corner of the rectangle.
        meta: Mapping with an "author" entry (stored as /T) and a
            "contents" entry (stored as /Contents).
        color: Iterable of colour components stored as /C. The default is
            an immutable tuple so it cannot be shared and mutated across
            calls.

    Returns:
        A DictionaryObject describing the /Highlight annotation.

    Raises:
        KeyError: If ``meta`` lacks "author" or "contents".
    """
    newHighlight = DictionaryObject()

    newHighlight.update({
        # Annotation flags: value 4 is the Print bit in the PDF
        # annotation-flags table.
        NameObject("/F"): NumberObject(4),
        NameObject("/Type"): NameObject("/Annot"),
        NameObject("/Subtype"): NameObject("/Highlight"),

        NameObject("/T"): TextStringObject(meta["author"]),
        NameObject("/Contents"): TextStringObject(meta["contents"]),

        NameObject("/C"): ArrayObject([FloatObject(c) for c in color]),
        NameObject("/Rect"): ArrayObject([
            FloatObject(x1),
            FloatObject(y1),
            FloatObject(x2),
            FloatObject(y2)
        ]),
        # Quad corners are listed as the y2 edge (x1 then x2) followed by
        # the y1 edge (x1 then x2).
        NameObject("/QuadPoints"): ArrayObject([
            FloatObject(x1),
            FloatObject(y2),
            FloatObject(x2),
            FloatObject(y2),
            FloatObject(x1),
            FloatObject(y1),
            FloatObject(x2),
            FloatObject(y1)
        ]),
    })

    return newHighlight
def addHighlightToPage(highlight, page, output):
    """Register ``highlight`` with ``output`` and attach it to ``page``.

    The annotation is added to the writer via ``output._addObject`` and the
    value that call returns is what gets stored in the page's /Annots
    array. The array is created when the page does not have one yet.
    """
    registered = output._addObject(highlight)

    if "/Annots" not in page:
        # First annotation on this page: start a fresh array.
        page[NameObject("/Annots")] = ArrayObject([registered])
        return

    page[NameObject("/Annots")].append(registered)
This is very helpful! I took it a bit further and used pdfminer to find the text that you're trying to highlight, optionally constraining the search to a bounding box. PDF makes this incredibly complicated - you have to find the coordinates of every letter on the page, cluster them into bounding boxes line by line, and then highlight that polygon. Hope this code helps someone the way @agentcooper's helped me.
You can then take the result of `highlight_annotation` and pass it to `writer.add_annotation`, as @MartinThoma suggested.
from pdfminer.high_level import extract_pages
from pdfminer.layout import LTChar, LTPage, LTTextContainer, LTTextLine
from pypdf import PdfReader, PdfWriter
from pypdf.generic import (ArrayObject, DictionaryObject, FloatObject, NameObject, NumberObject,
TextStringObject)
def is_within_bbox(bbox: list[float], constraint_bbox: list[float], margin = 10):
    """Tell whether ``bbox`` fits inside ``constraint_bbox``, with tolerance.

    Both boxes are four-element sequences unpacked as (x0, y0, x1, y1).
    Each edge of ``bbox`` may stick out of the constraint box by up to
    ``margin`` and still count as inside.
    """
    left, bottom, right, top = bbox
    limit_left, limit_bottom, limit_right, limit_top = constraint_bbox

    # Lower corner first: it must not fall more than `margin` below/left of
    # the constraint's lower corner.
    if not (limit_left <= left + margin and limit_bottom <= bottom + margin):
        return False

    # Upper corner: it must not extend more than `margin` above/right of the
    # constraint's upper corner.
    return limit_right >= right - margin and limit_top >= top - margin
def extract_char_bboxes(page_layout: LTPage, constraint_bbox: list[float] | None = None):
    """Collect the characters of a page together with their bounding boxes.

    Walks text containers -> text lines -> characters. When
    ``constraint_bbox`` is given, only characters accepted by
    ``is_within_bbox`` are kept.

    Returns:
        A ``(text, char_bboxes)`` tuple: the concatenated character text and
        one ``[x0, y0, x1, y1]``-style list per kept character, in the same
        order.
    """
    pieces = []
    char_bboxes: list[list[float]] = []

    for element in page_layout:
        if not isinstance(element, LTTextContainer):
            continue
        for text_line in element:
            if not isinstance(text_line, LTTextLine):
                continue
            for char_obj in text_line:
                if not isinstance(char_obj, LTChar):
                    continue
                box = list(char_obj.bbox)
                if constraint_bbox is not None and not is_within_bbox(box, constraint_bbox):
                    continue
                pieces.append(char_obj.get_text())
                char_bboxes.append(box)

    text = "".join(pieces)
    return text, char_bboxes
def find_text_bbox(pdf, target_text: str, constraint_bbox: list[float] | None = None) -> list[list[float]]:
    """Locate ``target_text`` in a PDF and return one bounding box per line.

    Pages are scanned in order and the search stops at the first page whose
    extracted text contains ``target_text``. The matched characters are
    grouped by their y0 coordinate, and each group is reduced to a single
    ``[min x0, min y0, max x1, max y1]`` box.

    Boxes are returned in the order their lines first appear in the
    extracted text, so the result is deterministic rather than depending on
    set iteration order.

    Args:
        pdf: Passed straight to ``extract_pages``.
        target_text: Text to look for. An empty string matches immediately
            and yields an empty list.
        constraint_bbox: Optional region; forwarded to
            ``extract_char_bboxes`` to restrict which characters are
            searched.

    Returns:
        A list of ``[x0, y0, x1, y1]`` boxes, one per distinct y0 value.

    Raises:
        ValueError: If no page contains ``target_text``.
    """
    for page_layout in extract_pages(pdf):
        text, char_bboxes = extract_char_bboxes(page_layout, constraint_bbox)

        start_index = text.find(target_text)
        if start_index == -1:
            continue
        end_index = start_index + len(target_text)

        # NOTE(review): slicing char_bboxes with text indices assumes every
        # extracted character contributed exactly one character of text -
        # confirm for ligatures / multi-character glyphs.
        # Dicts keep insertion order, so lines stay in first-seen order.
        chars_by_line: dict[float, list[list[float]]] = {}
        for char_bbox in char_bboxes[start_index:end_index]:
            chars_by_line.setdefault(char_bbox[1], []).append(char_bbox)

        line_bboxes = []
        for line_chars in chars_by_line.values():
            x0s, y0s, x1s, y1s = zip(*line_chars)
            line_bboxes.append([min(x0s), min(y0s), max(x1s), max(y1s)])
        return line_bboxes

    raise ValueError(f"Text '{target_text}' not found in the PDF.")
# Each box is [x0, y0, x1, y1]; (x0, y0) is the bottom left corner
def highlight_annotation(bounds: list[list[float]], author: str, contents: str, color = (1, 0, 0)):
    """Build a multi-line PDF highlight annotation dictionary.

    Args:
        bounds: One ``[x0, y0, x1, y1]`` box per highlighted line.
        author: Stored as the annotation's /T entry.
        contents: Stored as the annotation's /Contents entry.
        color: Iterable of colour components stored as /C. The default is
            an immutable tuple so it cannot be shared and mutated across
            calls.

    Returns:
        A DictionaryObject describing the /Highlight annotation.

    Raises:
        ValueError: If ``bounds`` is empty.
    """
    if not bounds:
        raise ValueError("bounds must contain at least one bounding box")

    # The rectangle that bounds the whole highlight
    all_x0, all_y0, all_x1, all_y1 = zip(*bounds)
    rect_bbox = [min(all_x0), min(all_y0), max(all_x1), max(all_y1)]

    # Quad points include corners for each line of highlight: the upper
    # edge (left, right) followed by the lower edge (left, right).
    quad_points = []
    for left, bottom, right, top in bounds:
        quad_points.extend([left, top, right, top, left, bottom, right, bottom])

    newHighlight = DictionaryObject({
        # Annotation flags: value 4 is the Print bit (NoZoom would be 8).
        NameObject("/F"): NumberObject(4),
        NameObject("/Type"): NameObject("/Annot"),
        NameObject("/Subtype"): NameObject("/Highlight"),

        NameObject("/T"): TextStringObject(author),
        NameObject("/Contents"): TextStringObject(contents),

        NameObject("/C"): ArrayObject([FloatObject(c) for c in color]),
        NameObject("/Rect"): ArrayObject([FloatObject(c) for c in rect_bbox]),
        NameObject("/QuadPoints"): ArrayObject([FloatObject(c) for c in quad_points]),
    })

    return newHighlight
Updated for new versions of PyPDF2 that don't support `PdfFileReader` and `PdfFileWriter`:
from PyPDF2 import PdfReader, PdfWriter
from PyPDF2.generic import (
DictionaryObject,
NumberObject,
FloatObject,
NameObject,
TextStringObject,
ArrayObject
)
def createHighlight(x1, y1, x2, y2, meta, color=(1, 0, 0)):
    """Build a PDF highlight annotation dictionary for one rectangle.

    Args:
        x1, y1: First corner of the rectangle.
        x2, y2: Opposite corner of the rectangle.
        meta: Mapping with an "author" entry (stored as /T) and a
            "contents" entry (stored as /Contents).
        color: Iterable of colour components stored as /C. The default is
            an immutable tuple so it cannot be shared and mutated across
            calls.

    Returns:
        A DictionaryObject describing the /Highlight annotation.

    Raises:
        KeyError: If ``meta`` lacks "author" or "contents".
    """
    rect = ArrayObject([
        FloatObject(x1),
        FloatObject(y1),
        FloatObject(x2),
        FloatObject(y2)
    ])
    # Quad corners: the y2 edge (x1 then x2) followed by the y1 edge.
    quad_points = ArrayObject([
        FloatObject(x1),
        FloatObject(y2),
        FloatObject(x2),
        FloatObject(y2),
        FloatObject(x1),
        FloatObject(y1),
        FloatObject(x2),
        FloatObject(y1)
    ])

    newHighlight = DictionaryObject()
    newHighlight.update({
        # Annotation flags: value 4 is the Print bit in the PDF
        # annotation-flags table.
        NameObject("/F"): NumberObject(4),
        NameObject("/Type"): NameObject("/Annot"),
        NameObject("/Subtype"): NameObject("/Highlight"),

        NameObject("/T"): TextStringObject(meta["author"]),
        NameObject("/Contents"): TextStringObject(meta["contents"]),

        NameObject("/C"): ArrayObject([FloatObject(c) for c in color]),
        NameObject("/Rect"): rect,
        NameObject("/QuadPoints"): quad_points,
    })

    return newHighlight
def addHighlightToPage(highlight, page, writer):
    """Attach ``highlight`` to ``page``'s /Annots array, creating it if absent.

    NOTE(review): ``writer`` is accepted but never used here, so the
    annotation is stored in the array as-is rather than being registered
    with the writer first - confirm this is intended.
    """
    if "/Annots" not in page:
        # First annotation on this page: start a fresh array.
        page[NameObject("/Annots")] = ArrayObject([highlight])
        return

    page["/Annots"].append(highlight)
@rbehal You should no longer use PyPDF2 as it's deprecated. Use pypdf. It supports highlights out of the box: https://pypdf.readthedocs.io/en/latest/user/adding-pdf-annotations.html#highlighting
You should use `writer.add_annotation(page_number=0, annotation=annotation)` instead of `addHighlightToPage(highlight, page1, pdfOutput)`.
I will likely soon add official support for highlight annotations to pypdf. See https://pypdf.readthedocs.io/en/latest/user/adding-pdf-annotations.html