-
-
Save agentcooper/4c55133f5d95866acdee5017cd318558 to your computer and use it in GitHub Desktop.
from PyPDF2 import PdfFileWriter, PdfFileReader | |
from PyPDF2Highlight import createHighlight, addHighlightToPage | |
# Usage example: add one highlight to the first page of input.pdf and save
# the result as output.pdf.
#
# Both files are opened with context managers so the handles are closed even
# if something fails. The reader is handed the open input stream and the page
# taken from it is only serialized by pdfOutput.write(), so the input stays
# open until the output has been written.
with open("input.pdf", "rb") as inputStream:
    pdfInput = PdfFileReader(inputStream)
    pdfOutput = PdfFileWriter()

    page1 = pdfInput.getPage(0)

    highlight = createHighlight(100, 400, 400, 500, {
        "author": "",
        "contents": "Bla-bla-bla"
    })

    addHighlightToPage(highlight, page1, pdfOutput)

    pdfOutput.addPage(page1)

    with open("output.pdf", "wb") as outputStream:
        pdfOutput.write(outputStream)
from PyPDF2.generic import ( | |
DictionaryObject, | |
NumberObject, | |
FloatObject, | |
NameObject, | |
TextStringObject, | |
ArrayObject | |
) | |
# Coordinates start in the bottom-left corner (x1, y1 is taken as the
# lower-left point of the rectangle, x2, y2 as the upper-right one).
def createHighlight(x1, y1, x2, y2, meta, color = [1, 0, 0]):
    """Build the dictionary for a single-rectangle ``/Highlight`` annotation.

    ``meta`` must provide the keys ``"author"`` (stored as ``/T``) and
    ``"contents"`` (stored as ``/Contents``). ``color`` is an iterable of
    numbers stored as ``/C``; it is only read, never modified.

    Returns the new ``DictionaryObject``. Nothing is attached to a page here.
    """
    def float_array(values):
        # Wrap plain numbers as a PDF array of FloatObject entries.
        return ArrayObject([FloatObject(value) for value in values])

    entries = {
        NameObject("/F"): NumberObject(4),
        NameObject("/Type"): NameObject("/Annot"),
        NameObject("/Subtype"): NameObject("/Highlight"),
        NameObject("/T"): TextStringObject(meta["author"]),
        NameObject("/Contents"): TextStringObject(meta["contents"]),
        NameObject("/C"): float_array(color),
        # Bounding rectangle of the annotation.
        NameObject("/Rect"): float_array((x1, y1, x2, y2)),
        # Corner order: (x1, y2), (x2, y2), (x1, y1), (x2, y1).
        NameObject("/QuadPoints"): float_array(
            (x1, y2, x2, y2, x1, y1, x2, y1)
        ),
    }

    newHighlight = DictionaryObject()
    newHighlight.update(entries)
    return newHighlight
def addHighlightToPage(highlight, page, output):
    """Register ``highlight`` with ``output`` and reference it from ``page``.

    The annotation is first added to the writer through ``_addObject``; the
    reference returned by that call is what gets stored in the page's
    ``/Annots`` array. The array is created when the page has none yet.
    """
    annots_key = NameObject("/Annots")
    highlight_ref = output._addObject(highlight)

    if "/Annots" not in page:
        page[annots_key] = ArrayObject([highlight_ref])
        return

    page[annots_key].append(highlight_ref)
Thanks for sharing 👍
I just want to know if there is a way to extract QuadPoints from certain regions in our pdf?
Could you please explain the code? Also, why did you import objects such as DictionaryObject?
You should use writer.add_annotation(page_number=0, annotation=annotation)
instead of addHighlightToPage(highlight, page1, pdfOutput)
I will likely soon add official support for highlight annotations to pypdf. See https://pypdf.readthedocs.io/en/latest/user/adding-pdf-annotations.html
This is very helpful! I took it a bit further and used pdfminer to find the text that you're trying to highlight, optionally constraining that search space to a bounding box. PDF makes this incredibly complicated - you have to find the coordinates of every letter on the page, cluster them into bounding boxes line by line, and then highlight that polygon. Hope this code helps someone the way @agentcooper's helped me.
You can then take the result of highlight_annotation
and pass that on to writer.add_annotation
as @MartinThoma suggested
from pdfminer.high_level import extract_pages
from pdfminer.layout import LTChar, LTPage, LTTextContainer, LTTextLine
from pypdf import PdfReader, PdfWriter
from pypdf.generic import (ArrayObject, DictionaryObject, FloatObject, NameObject, NumberObject,
TextStringObject)
def is_within_bbox(bbox: list[float], constraint_bbox: list[float], margin = 10):
    """Tell whether ``bbox`` lies inside ``constraint_bbox``, give or take ``margin``.

    Both boxes are ``[x0, y0, x1, y1]``. Each edge of ``bbox`` may stick out
    of the constraint box by at most ``margin`` units and still count as
    inside.
    """
    left, bottom, right, top = bbox
    limit_left, limit_bottom, limit_right, limit_top = constraint_bbox

    # Lower-left corner, pushed inwards by the margin, must not fall short
    # of the constraint's lower-left corner.
    low_corner_ok = limit_left <= left + margin and limit_bottom <= bottom + margin
    # Upper-right corner, pulled inwards by the margin, must not exceed the
    # constraint's upper-right corner.
    high_corner_ok = limit_right >= right - margin and limit_top >= top - margin

    return low_corner_ok and high_corner_ok
def extract_char_bboxes(page_layout: LTPage, constraint_bbox: list[float] | None = None):
    """Collect the text of a page together with one bounding box per character.

    Walks ``page_layout`` -> text containers -> text lines -> ``LTChar``
    objects. Anything in a line that is not an ``LTChar`` is skipped and
    contributes nothing to the returned text.

    Args:
        page_layout: The page layout to walk.
        constraint_bbox: Optional ``[x0, y0, x1, y1]`` region. When given,
            only characters accepted by ``is_within_bbox`` are kept.

    Returns:
        ``(text, char_bboxes)`` where ``char_bboxes[i]`` is the
        ``[x0, y0, x1, y1]`` box of ``text[i]``.
    """
    text_parts: list[str] = []
    char_bboxes: list[list[float]] = []

    for element in page_layout:
        if not isinstance(element, LTTextContainer):
            continue
        for text_line in element:
            if not isinstance(text_line, LTTextLine):
                continue
            for char_obj in text_line:
                if not isinstance(char_obj, LTChar):
                    continue
                char_bbox = list(char_obj.bbox)
                if constraint_bbox is not None and not is_within_bbox(char_bbox, constraint_bbox):
                    continue
                char_text = char_obj.get_text()
                text_parts.append(char_text)
                # Callers slice char_bboxes with indices found in the text, so
                # the two must stay the same length: if one glyph yields
                # several characters, repeat its box once per character.
                char_bboxes.extend(list(char_bbox) for _ in char_text)

    # Join once at the end instead of growing a string inside the loop.
    return "".join(text_parts), char_bboxes
def find_text_bbox(pdf, target_text: str, constraint_bbox: list[float] | None = None) -> list[list[float]]:
    """Locate ``target_text`` in a PDF and return one bounding box per text line.

    Pages are scanned in order and the first occurrence on the first page
    that contains the text is used. Its characters are grouped by the bottom
    coordinate (``y0``) of their boxes, and each group is merged into a
    single ``[x0, y0, x1, y1]`` box.

    Args:
        pdf: Passed through unchanged to ``extract_pages``.
        target_text: Text to search for, matched against the concatenated
            character text returned by ``extract_char_bboxes``.
        constraint_bbox: Optional ``[x0, y0, x1, y1]`` region limiting which
            characters are considered.

    Returns:
        The merged line boxes, ordered by where each line first appears in
        the matched text.

    Raises:
        ValueError: If no page contains ``target_text``.
    """
    for page_layout in extract_pages(pdf):
        text, char_bboxes = extract_char_bboxes(page_layout, constraint_bbox)

        # A single search gives both "is it there" and "where".
        start_index = text.find(target_text)
        if start_index == -1:
            continue
        end_index = start_index + len(target_text)

        # Group the matched characters by their y0. A dict keeps first-seen
        # order, so the result is deterministic (a set would not be), and
        # the lookup per character is O(1).
        chars_by_line: dict[float, list[list[float]]] = {}
        for char_bbox in char_bboxes[start_index:end_index]:
            chars_by_line.setdefault(char_bbox[1], []).append(char_bbox)

        # Merge each line's character boxes into one enclosing box.
        return [
            [
                min(char[0] for char in line_chars),
                min(char[1] for char in line_chars),
                max(char[2] for char in line_chars),
                max(char[3] for char in line_chars),
            ]
            for line_chars in chars_by_line.values()
        ]

    raise ValueError(f"Text '{target_text}' not found in the PDF.")
# Coordinates start in the bottom-left corner; every box is [x0, y0, x1, y1]
# with (x0, y0) taken as its lower-left point and (x1, y1) as its upper-right.
def highlight_annotation(bounds: list[list[float]], author: str, contents: str, color = [1, 0, 0]):
    """Build a ``/Highlight`` annotation dictionary covering several boxes.

    Args:
        bounds: One ``[x0, y0, x1, y1]`` box per highlighted line.
        author: Stored as the annotation's ``/T`` entry.
        contents: Stored as the annotation's ``/Contents`` entry.
        color: Numbers stored as ``/C``. The default list is only read,
            never modified.

    Returns:
        A ``DictionaryObject`` describing the annotation. It is not attached
        to any page by this function.

    Raises:
        ValueError: If ``bounds`` is empty.
    """
    if not bounds:
        # Same exception type the bare unpacking below would raise, but with
        # a message that says what is actually wrong.
        raise ValueError("bounds must contain at least one [x0, y0, x1, y1] box.")

    # The rectangle that bounds the whole highlight
    lefts, bottoms, rights, tops = zip(*bounds)
    rect_bbox = [min(lefts), min(bottoms), max(rights), max(tops)]

    # Quad points include the four corners of each highlighted line, in the
    # order top-left, top-right, bottom-left, bottom-right.
    quad_points = []
    for left, bottom, right, top in bounds:
        quad_points.extend([left, top, right, top, left, bottom, right, bottom])

    newHighlight = DictionaryObject({
        # Annotation flags: 4 is bit 3, the Print flag in the PDF
        # specification (NoZoom would be 8).
        NameObject("/F"): NumberObject(4),
        NameObject("/Type"): NameObject("/Annot"),
        NameObject("/Subtype"): NameObject("/Highlight"),
        NameObject("/T"): TextStringObject(author),
        NameObject("/Contents"): TextStringObject(contents),
        NameObject("/C"): ArrayObject([FloatObject(c) for c in color]),
        NameObject("/Rect"): ArrayObject([FloatObject(c) for c in rect_bbox]),
        NameObject("/QuadPoints"): ArrayObject([FloatObject(c) for c in quad_points]),
    })
    return newHighlight
Updated for newer versions of PyPDF2, which no longer support PdfFileReader and PdfFileWriter:
from PyPDF2 import PdfReader, PdfWriter
from PyPDF2.generic import (
DictionaryObject,
NumberObject,
FloatObject,
NameObject,
TextStringObject,
ArrayObject
)
def createHighlight(x1, y1, x2, y2, meta, color=[1, 0, 0]):
    """Return a ``/Highlight`` annotation dictionary for one rectangle.

    ``meta["author"]`` becomes ``/T`` and ``meta["contents"]`` becomes
    ``/Contents``. ``color`` supplies the numbers for ``/C`` and is only
    read. ``/Rect`` is ``[x1, y1, x2, y2]``; ``/QuadPoints`` lists the
    corners as (x1, y2), (x2, y2), (x1, y1), (x2, y1).
    """
    author_text = TextStringObject(meta["author"])
    contents_text = TextStringObject(meta["contents"])
    color_array = ArrayObject([FloatObject(c) for c in color])

    rect_array = ArrayObject(
        [FloatObject(coord) for coord in (x1, y1, x2, y2)]
    )
    quad_array = ArrayObject(
        [FloatObject(coord) for coord in (x1, y2, x2, y2, x1, y1, x2, y1)]
    )

    newHighlight = DictionaryObject()
    newHighlight.update({
        NameObject("/F"): NumberObject(4),
        NameObject("/Type"): NameObject("/Annot"),
        NameObject("/Subtype"): NameObject("/Highlight"),
        NameObject("/T"): author_text,
        NameObject("/Contents"): contents_text,
        NameObject("/C"): color_array,
        NameObject("/Rect"): rect_array,
        NameObject("/QuadPoints"): quad_array,
    })
    return newHighlight
def addHighlightToPage(highlight, page, writer):
    """Attach ``highlight`` to ``page`` as an indirect object owned by ``writer``.

    Args:
        highlight: Annotation dictionary, e.g. from ``createHighlight``.
        page: Page object whose ``/Annots`` array receives the annotation.
        writer: The ``PdfWriter`` that will write the page. The annotation
            is registered with it.
    """
    # Register the annotation with the writer and store the returned
    # reference rather than the dictionary itself. This puts the `writer`
    # argument to use and yields an indirect reference in /Annots.
    # NOTE(review): _add_object is a private PdfWriter method - confirm it
    # exists in the installed PyPDF2/pypdf version.
    highlight_ref = writer._add_object(highlight)

    if "/Annots" in page:
        page["/Annots"].append(highlight_ref)
    else:
        page[NameObject("/Annots")] = ArrayObject([highlight_ref])
@rbehal You should no longer use PyPDF2 as it's deprecated. Use pypdf. It supports highlights out of the box: https://pypdf.readthedocs.io/en/latest/user/adding-pdf-annotations.html#highlighting
Is there a way to search for TextStringObjects and use the results to highlight them?