Skip to content

Instantly share code, notes, and snippets.

@agentcooper
Created November 17, 2016 22:50
Show Gist options
  • Star 32 You must be signed in to star a gist
  • Fork 9 You must be signed in to fork a gist
  • Save agentcooper/4c55133f5d95866acdee5017cd318558 to your computer and use it in GitHub Desktop.
Save agentcooper/4c55133f5d95866acdee5017cd318558 to your computer and use it in GitHub Desktop.
from PyPDF2 import PdfFileWriter, PdfFileReader
from PyPDF2Highlight import createHighlight, addHighlightToPage
# Example: add one highlight annotation to the first page of input.pdf and
# save the result as output.pdf.
# The reader keeps using the input stream while pages are written, so the
# input file stays open until the output has been produced; both files are
# closed by the `with` blocks even if an error occurs.
with open("input.pdf", "rb") as inputStream:
    pdfInput = PdfFileReader(inputStream)
    pdfOutput = PdfFileWriter()

    page1 = pdfInput.getPage(0)

    # Corners of the highlighted rectangle: (100, 400) and (400, 500).
    highlight = createHighlight(100, 400, 400, 500, {
        "author": "",
        "contents": "Bla-bla-bla"
    })

    addHighlightToPage(highlight, page1, pdfOutput)

    pdfOutput.addPage(page1)

    with open("output.pdf", "wb") as outputStream:
        pdfOutput.write(outputStream)
from PyPDF2.generic import (
DictionaryObject,
NumberObject,
FloatObject,
NameObject,
TextStringObject,
ArrayObject
)
# x1, y1 starts in bottom left corner
def createHighlight(x1, y1, x2, y2, meta, color=(1, 0, 0)):
    """Build a PDF highlight annotation dictionary for a single rectangle.

    Args:
        x1, y1: First corner of the rectangle (the bottom-left one, per the
            note above).
        x2, y2: Opposite corner of the rectangle.
        meta: Mapping that must provide the keys "author" (written to /T)
            and "contents" (written to /Contents).
        color: Iterable of numbers written to /C. The default is an
            immutable tuple so it cannot be shared and mutated across calls.

    Returns:
        A DictionaryObject describing the annotation. It is not attached to
        any page here; see addHighlightToPage.

    Raises:
        KeyError: If meta lacks "author" or "contents".
    """
    rect = [x1, y1, x2, y2]
    # One quadrilateral covering the rectangle, listed as
    # top-left, top-right, bottom-left, bottom-right.
    quad_points = [x1, y2, x2, y2, x1, y1, x2, y1]

    newHighlight = DictionaryObject()
    newHighlight.update({
        # Annotation flag value 4 is the "Print" flag in the PDF specification.
        NameObject("/F"): NumberObject(4),
        NameObject("/Type"): NameObject("/Annot"),
        NameObject("/Subtype"): NameObject("/Highlight"),
        NameObject("/T"): TextStringObject(meta["author"]),
        NameObject("/Contents"): TextStringObject(meta["contents"]),
        NameObject("/C"): ArrayObject([FloatObject(c) for c in color]),
        NameObject("/Rect"): ArrayObject([FloatObject(v) for v in rect]),
        NameObject("/QuadPoints"): ArrayObject(
            [FloatObject(v) for v in quad_points]
        ),
    })
    return newHighlight
def addHighlightToPage(highlight, page, output):
    """Register a highlight with the writer and attach it to a page.

    The annotation is added to ``output`` through its ``_addObject`` method
    and the value that call returns is stored in the page's /Annots array.
    The array is created when the page does not have one yet.

    Args:
        highlight: Annotation dictionary, e.g. from createHighlight.
        page: Page object that receives the annotation.
        output: Writer object exposing ``_addObject``.
    """
    annotation_ref = output._addObject(highlight)
    annots_key = NameObject("/Annots")

    if "/Annots" not in page:
        page[annots_key] = ArrayObject([annotation_ref])
        return

    page[annots_key].append(annotation_ref)
@debu999
Copy link

debu999 commented Sep 28, 2017

Thanks. It really helps. :) Great utility

@bisgis
Copy link

bisgis commented Feb 19, 2018

Thanks for the help, friend! Could you help with my code? Sorry, I'm new to Python. My code asks the user to input a city zoning code and then prints information about it. I'd also like to highlight this zoning code in a PDF document. I tried to copy and paste your code below mine to do the second part, but I get the error "ImportError: cannot import name PdfFileWriter"

What should I do? I'm pasting my code below, and if you want I can pay you for your help!
Thank you so much! Take care!

# Prompt the user to input a zoning code

duncan_code = input("Type the Zoning code: ")

# Text printed for every known zoning code: the zone name comes first,
# followed by its regulations, one printed line per tuple entry.
zoning_details = {
    "R-1": (
        "Urban Residential",
        "Uses: Single family residential dwelling - Horticulture - Home occupation - Boarding house - Bed and breakfast accommodation - Daycare, nursery school accessory to a residence - Suite as permitted under Section 5.21",
        "The height of all buildings and strucutre shall nor exceed 7.5 metres, expect for acessory buildings which shall not exceed a height of 4.0 metres",
        "Setbacks - Residential use: Front 7.5 m - Side (interior) 1.5 m - Side (exterior) 4.5 m - Rear 7.5 m",
        "Setbacks - Acessory Residential use: Front 17.5 m - Side (interior) 0.5 m - Side (exterior) 4.5 m - Rear 0.5 m",
        "Minimum floor space area of a single-family residential dwelling shall be not less than 85.0 square m.",
        "In no case shall a residential accessory building be located closer than 2.5 m from a principal residential dwelling unit.",
        "The total residential floor area shall not exceed 0.5 times the total horizontal area of the parcel.",
    ),
    "R-2": (
        "Suburban Residential",
        "Uses: Single family residential dwelling - Two family residential dwelling - Horticulture - Home occupation - Boarding house - Bed and breakfast accommodation - Daycare, nursery school accessory to a residence - Suite as permitted under Section 5.21",
        "The height of all buildings and strucutre shall nor exceed 7.5 metres, expect for acessory buildings which shall not exceed a height of 4.0 metres",
        "Setbacks - Residential use: Front 7.5 m - Side (interior) 1.5 m - Side (exterior) 4.5 m - Rear 7.5 m",
        "Setbacks - Acessory Residential use: Front 17.5 m - Side (interior) 0.5 m - Side (exterior) 4.5 m - Rear 0.5 m",
        "Minimum floor space area of a single-family residential dwelling shall be not less than 85.0 square m.",
        "Minimum floor space area of a two-family residential dwelling shall be not less than 60.0 square m.",
        "In no case shall a residential accessory building be located closer than 2.5 m from a principal residential dwelling unit.",
        "The total residential floor area shall not exceed 0.5 times the total horizontal area of the parcel.",
    ),
    "RM-1": (
        "Low Density Multi-Family Residential",
        "Uses: Single family residential dwelling - Two family residential dwelling - Townhouses - Horticulture - Home occupation - Bed and breakfast accommodation - Daycare, nursery school accessory to a permitted use - Suite as permitted under Section 5.21",
        "The height of any principal building or structure shall not exceed the lesser of 10.0 metres or 2.5 storeys where a half storey is defined as a storey under a sloping roof, the wall plates of which, on at least two opposite walls are not more than 0.6 metres above the finished floor of such storey.",
        "The height of any accessory building shall not exceed 4.0 metres",
        "Setbacks - Single and Two Family Residential Dwelling Use: Front 7.5 m - Side (interior) 1.5 m - Side (exterior) 4.5 m - Rear 7.5 m",
        "Setbacks - Townhouse Use: Front 7.5 m - Side (interior) 3.0 m - Side (exterior) 6.0 m - Rear 7.5 m",
        "Setbacks - Acessory Residential use: Front 17.5 m - Side (interior) 0.5 m - Side (exterior) 4.5 m - Rear 0.5 m",
        "In no case shall a residential accessory building be located closer than 2.5 metres from a principal residential dwelling unit.",
        "Minimum floor space area of a single-family residential dwelling shall be not less than 85.0 square m.",
        "Minimum floor space area of a two-family residential dwelling shall be not less than 60.0 square m.",
        "In no case shall a residential accessory building be located closer than 2.5 m from a principal residential dwelling unit.",
        "The minimum gross floor area for the type of residential dwelling unit shall be: Single Family Dwelling 85 sq. m. - Two Family Dwelling 60 sq. m. - Townhouse with one bedroom: 50 sq. m. - Townhouse with two bedrooms 65 sq. m. - Townhouse withh three bedrooms - 65 sq. m.",
        "Single family residential dwellings are permitted in this zone with the density subject to the requirements of the minimum lot size for the R-1 (Urban Residential) zone. The parcel may be either a strata or bare land strata development",
        "Two family residential dwellings are permitted in this zone with the density subject to the requirements of the minimum lot size for the R-2 (Suburban Residential) zone. The parcel may be either a strata or bare land strata development.",
        "Density - For any parcel in the RM-1 zone, the maximum density of dwelling units shall not exceed 40 units per hectare of parcel area.",
    ),
    "RM-2": (
        "Medium density - Restricted - Multi-Family Residential",
        "Uses: Apartment - Home Occupation",
        "The height of any principal building shall not exceed the lesser of 12.0 metres or three stories",
        "The height of any accessory building shall not exceed 4.0 metres",
        "Setbacks - Apartment Use: Front 7.5 m - Side (interior) 3.0 m - Side (exterior) 6.0 m - Rear 7.5 m",
        "Setbacks - Acessory Residential Use: Front 17.5 m - Side (interior) 0.5 m - Side (exterior) 4.5 m - Rear 0.5 m",
        "The minimum gross floor area for the type of residential dwelling unit shall be: One Bedroom Single Occupancy: 46-50 sq. m. - One Bedroom Double Occupancy: 50-56 sq. m. - One Bedroom Wheelchair Accessible: 52-56 sq. m. - Townhouse with two bedrooms 65 sq. m. - Townhouse withh three bedrooms - 65 sq. m.",
        "For any parcel in the RM-2 Zone, the maximum density shall not exceed 130 residential dwelling units per hectare.",
    ),
    "RM-3": (
        "Medium density - 3 storey - Multi-Family Residential",
        "Uses: Townhouse - Apartment - Horticulture - Parking as a principal use provided it is for the exclusive use of a permitted principal use on an adjacent parcel zoned P-1 Institutional which meets the same requirements specified in subsection 4.4",
        "The height of any principal building shall not exceed the lesser of 12.0 metres or three habitable stories",
        "the height of any accessory building shall not exceed 4.0 metres",
        "Setbacks - Townhouse and Apartment Use: Front 7.5 m - Side (interior) 3.0 m - Side (exterior) 6.0 m - Rear 7.5 m",
        "Setbacks - Acessory Residential Use - Front 17.5 m - Side (interior) 0.5 m - Side (exterior) 4.5 m - Rear 0.5 m",
        "The minimum gross floor area for the type of residential dwelling unit shall be: Apartment (Bachelor unit): 33 sq. m. - Apartment (One bedroom): 50 sq. m. - Apartment (Two bedroom): 65 sq. m. - Apartment (Three bedroom) 85 sq. m.",
        "For any parcel in the RM-3 zone, the maximum density of dwelling units shall be 100 per hectare.",
    ),
    "RM-4": (
        "Medium density - 4 storey - Multi-Family Residential",
        "Uses: Townhouse - Apartment - Horticulture",
        "The height of any principal building shall not exceed the lesser of 13.5m or four habitable storeys. The height of any accessory building shall not exceed 4.0 metres",
        "Setbacks - Townhouse and Apartment - Front 7.5 m - Side (interior) 3.0 m - Side (exterior) 6.0 m - Rear 7.5 m",
        "Setbacks - Acessory Residential Use - Front 17.5 m - Side (interior) 0.5 m - Side (exterior) 4.5 m - Rear 0.5 m",
        "The minimum gross floor area for the type of residential dwelling unit shall be: Apartment (Bachelor unit): 33 sq. m. - Apartment (One bedroom): 50 sq. m. - Apartment (Two bedroom): 65 sq. m. - Apartment (Three bedroom) 85 sq. m.",
        "For any parcel in the RM-4 zone, the maximum density of dwelling units shall be 100 per hectare.",
    ),
    "RM-6": (
        "High density - 6 storey - Residential Commercial",
        "Uses: Apartment - Private hospital, commercial care facility, rest home - Theatres, auditorium, places of recreation - Business of professional offices, banks - Retail stores - Laundry, dry cleaning and personal service uses - Government, institutional, schools, churches - Premises lincesed for the sale of alcoholic beverages - Medical laboratory - Catering service, restaurant, excluding drive-in and drive-through - Home occupation",
        "Any use other than residential shall be restricted to the floor or floors below grade, and the ground floor",
        "The height of any principal building shall not exceed the lesser of 20.0 metres or 6 storeys",
        "Off-street surface parking shall not be located in the required setback from a street boundary",
        "Setbacks - Front 7.5 m - Side (interior) 3.0 m - Side (exterior) 6.0 m - Rear 7.5 m",
        "For any parcel in the RM-6 zone the maximum density of dwelling units shall not exceed 180 units per hectare of parcel area.",
    ),
    "RM-6-A": (
        "High density Residential",
        "Uses: Apartment - Privahte hospital, commercial care facility, rest home - Theatres, auditorium, places of recreation - Business or professional offices - Banks - Retail, restricted - Laundry, dry cleaning and personal service uses - Government, institutional schools, churches - Medical laboratory - Catering Service - Restaurant, excluding drive-in and drive-through - Home Occupation",
        "PROHIBITED Uses: Premises licensed for the sale of alcoholic beverages - Convenience store",
        "Any use other than residential shall be restricted to the floor or floors below grade, and the ground floor.",
        "The height of any principal building shall not exceed the lesser of 20.0 metres or 6 storeys",
        "Off-street surface automobile parking shall not be located in the required setback from a street boundary.",
        "Setbacks - Front 7.5 m - Side (interior) 3.0 m - Side (exterior) 6.0 m - Rear 7.5 m",
        "For any parcel in the RM-6-A zone the maximum density of dwelling units shall not exceed 100 units per hectare of parcel area except developments in accordance with 7.9",
    ),
    "C-1": (
        "General Commercial",
        "Uses: Bakery - Bank, financial institution - Barbershop, beauty parlour - Bus or transportation depot - Churches - Fitness studio, racquet club - Funeral parlour - Home Occupation - Independent school - Live/Work Studio above a permitted General Commercial use - Medical laboratory - Office use - Premise licensed for the sale of alcoholic beverages - Printing and publishing - Repair and servicing of personal and household goods, power tools, eletric and electronic goods, power tools, electric and electronic goods - Residential use above a permitted General Commercial use - Restaurant, catering, but excluding drive-in and drive-through - Retail store, including wholesale sales use - Seniors Centre - Shoe repair, retail laundry, dry cleaning and other home and personal service use - Theatre, art gallery, museum - Tourist accommodation - Vehicle parking and storage use - Veterinary clinic",
        "Acessory Uses: Commercial storage",
        "The maximum floor space ratio shall not exceed 3:1 for all buildings and structures",
        "The maximum height for all principle buildings and structures shall not exceed four (4) storeys nor 18m",
        "The maximum height for all accessory buildings and structures shall not exceed 4.5m",
        "Setbacks - Front 0 m - Side (interior) 0 m - Side (exterior) 0 m - Rear 4.5 m",
        "All commercial storage facilities shall be within a basement or located above the first storey only and shall not be permitted within the first storey",
        "The gross floor area of the commercial storage use, inclusive of hallways, shall not exceed twenty-five (25) % of the total gross floor area of the principal building in which it is located",
        "Access to the commercial storage shall be via an alley, a parking lot adjacent to the rear or side of a building, or internally within the building only",
        "no direct access to the commercial storage by any part of the building Fronting a public street is permitted",
        "No outdoor storage is permitted",
        "For any parcel in the C-1 zone, the maximum density of residential dwelling units shall not exceed 150/ha",
    ),
    "C-1-A": (
        "General Commercial / Microbrewery",
        "Uses: Bakery - Bank, financial institution - Barbershop, beauty parlour - Bus or transportation depot - Churches - Fitness studio, racquet club - Funeral parlour - Home Occupation - Independent school - Medical laboratory - Microbrewery - Office use - Premise licensed for the sale of alcoholic beverages - Printing and publishing - Repair and servicing of personal and household goods, power tools, electric and electronic goods - Residential use above a permitted General Commercial use - Restaurant, catering, but excluding drive-in and drive- through - Retail store, including wholesale sales use - Shoe repair, retail, laundry and dry cleaning and other home and personal service use - Theatre, art gallery, museum - Tourist accommodation - Vehicle parking and storage use - Veterinary clinic",
        "The maximum floor space ratio shall not exceed 3:1 for all buildings and structures. The maximum height for all principle buildings and structures shall not exceed four (4) storeys nor 18m. The maximum height for all accessory buildings and structures shall not exceed 4.5m",
        "Setbacks - Front 0 m - Side (interior) 0 m - Side (exterior) 0 m - Rear 0 m - A parcel line abutting a residential zoned parcel 4.5 m",
        "For any parcel in the C-1-A zone, the maximum density of residential dwelling units shall not exceed 150/ha",
    ),
    "C-1-B": (
        "General Commercial / Mixed Use",
        "Uses: Bakery - Bank, financial institution - Barbershop, beauty parlour - Churches - Fitness studio, racquet club - Home Occupation - Independent School - Live/Work Studio above a permitted General Commercial use - Medical laboratory - Office use - Premise licensed for the sale of alcoholic beverages - Printing and publishing - Repair and servicing of personal and household goods, power tools, electric and electronic goods - Residential use above a permitted General Commercial use - Restaurant, catering, but excluding drive -in and drive -through - Retail store, including wholesale use - Seniors Centre - Shoe repair, retail, laundry and dry cleaning and other home and personal service use - Theatre, art gallery, museum - Tourist accommodation - Veterinary clinic",
        "The maximum floor space ratio shall not exceed 3:1 for all buildings and structures - The maximum height for all principle buildings and structures shall not exceed five (5) storeys nor 18m - The maximum height for all accessory buildings and structures shall not exceed 4.5m",
        "Setbacks (Minimum / Maximum - Front 1.5 m at the ground level at least 2 m back from the ground level building Front for the 4th & 5th storeys / 4 m at ground level and 7 m for 4th & 5th storeys - Side (interior) 0 / 0 - Side (exterior) 1.5 at least 2 m back from the ground level building front for the 4th & 5th storeys - 4 m at ground level 7 m for 4th & 5th storeys - Rear 0 / 0 - A parcel line abutting a residential zoned parcel 4.5 m / 0",
        "Where a maximum front yard setback is required, no more than 50% of the front face of a building façade shall be setback further than the maximum permitted front yard setback.",
        "For any parcel in the C-1 zone, the maximum density of residential dwelling units shall not exceed 150 units per hectare, except where one (1) parking space per residential unit is provided in an underground structure, the maximum density shall be increased to 200 units per hectare.",
    ),
    "C-2": (
        "Office Commercial",
        "Uses: Office use - bank, financial institution - retail store, restaurant and home or personal service use - residential use located above a permitted Commercial Use - home occupation - Vehicle parking and storage use for a C-1 (General Commercial - or C-2 (Office Commercial - Zoned property located within 153 meters from the subject property.",
        "The parcel coverage shall not exceed 50 percent for all buildings and structures - The height of all buildings and structures shall not exceed 12.0 metres except for accessory buildings which shall not exceed a height of 4.5 metres",
        "Setbacks - Front 4.5 m - Side (interior) 1.5 m - Side (exterior) 4.5 m - Rear 7.5 m",
        "A retail store, restaurant and home or personal service use shall only be located on the ground floor of a principal building (in which the principal use is an office use) and in no case shall the total gross floor area utilized for a retail store use, restaurant, home or personal service use (or combination thereof) exceed 25 percent of the total gross floor area of the principal building in which they are located.",
        "Minimum gross floor area for apartments - Bachelor unit 33 sq. m. - One bedroom 50 sq. m. - Two bedroom 65 sq. m. - Three bedroom 85 sq. m.",
        "For any parcel in the C-2 zone, the maximum density of dwelling units shall not exceed 70 units per hectare of parcel area.",
    ),
    "C-3": (
        "Service Commercial",
        "Uses: Motor vehicle sales, rental, servicing and repair; service station, excluding auto wrecking and storage of wrecked vehicles - retail and wholesale sale of motor vehicle parts and accessories - retail and wholesale sale of building and camping supplies, including ancillary outdoor storage - sale, rental or servicing of mobile homes, recreation vehicles, boats, motorcycles, farm and industrial equipment, power tools and household equipment; - parking garage and parking lot, bus depot or transportation terminal - commercial plant nurseries, horticulture, retail sales of gardening supplies and produce, accessory outdoor storage - restaurants, catering, including drive-in and drive-thru restaurants - convenience store, grocery outlet - office, retail and wholesale sales and warehousing - car wash, laundromat - cabaret - single family residential use as an accessory use to a use permitted in Section 8.3 - tourist accommodation - bank, financial institution - personal service establishment - dog grooming - indoor dog training",
        "The parcel coverage shall not exceed 50 percent for all buildings and structures - the height of all buildings and structures shall not exceed 12 metres except for accessory buildings which shall not exceed a height of 7.5 metres",
        "Setbacks - Front 4.5 m - Side (interior) 0 m - Side (exterior) 4.5 m - Rear 0 m",
        "The minimum gross floor area of a principal building shall be not less than 45 square metres",
        "Outdoor storage accessory to a permitted use shall be permitted under special circumstances.",
    ),
    "C-4": (
        "Tourist Recreational Commercial",
        "Uses: Tourist accommodations - campground, including recreation vehicle park - rooming house use, boarding house use - retail sales, personal service use, gift shop accessory to a tourist accommodation use - one single family residential dwelling use per parcel accessory to a use permitted in Section 8.4",
        "The maximum site coverage shall not exceed 40 percent for all buildings and structures - the height of all buildings and structures shall not exceed 12 metres - not less than 30 percent of the site area shall be retained as open space wherein no buildings, structures, parking area, loading area or access driveway shall be located",
        "The minimum setback for all buildings and structures from any parcel line shall be not less than 7.5 metres",
    ),
    "C-5": (
        "Local Commercial",
        "Uses: Convenience store - café or coffee shop auxiliary to and in addition to a use in Section 8.5 - one single family residential dwelling per parcel within a principal building where such use is auxiliary to and in addition to a use in Section 8.5",
        "The parcel coverage shall not exceed 40 percent for any buildings and structures - the height of all buildings and structures shall not exceed 7.5 metres",
        "Setbacks - Front 7.5 m - Side (interior) 1.5 m - Side (exterior) 4.5 m - Rear 3.0 m",
        "The minimum gross floor area for an auxiliary single family residential dwelling hall be 70.0 square metres",
    ),
    "P-1": (
        "Institutional",
        "Uses: Institution - assembly use - personal care facility - public school, private school, including boarding facilities - public parking use - fish hatchery - public works yard and public utility use and - one single family residential dwelling unit per parcel accessory to a permitted use as specified in 9.1",
        "The parcel coverage shall not exceed 50 percent for all buildings and structures - the height for all buildings and structures shall not exceed 12.0 metres",
        "Setbacks - Front yard 6.0 m - Side yard 6.0 m - Rear yard 6.0 m",
    ),
    "P-2": (
        "Parks and Recreation",
        "Uses: Park, park reserve and greenbelt - playground - ecological reserve, wildlife sanctuary - public botanical garden.",
        "The parcel coverage shall not exceed 15 percent for all buildings and structures -the height for all buildings and structures shall not exceed 12.0 metres",
        "Setbacks - Front yard 6.0 m - Side yard 6.0 m - Rear yard 6.0 m",
    ),
}

# A code that is not in the table produces the single line "Unknown".
for detail_line in zoning_details.get(duncan_code, ("Unknown",)):
    print(detail_line)

from PyPDF2 import PdfFileWriter, PdfFileReader

from PyPDF2Highlight import createHighlight, addHighlightToPage

# The path is a raw string: in a normal string literal the "\r" of
# "\raimundocorreiafe" is read as a carriage return. PDF files are binary,
# so the file is opened with mode "rb".
# NOTE(review): the path has no ".pdf" extension - confirm the real file name.
with open(r"Q:\raimundocorreiafe\GIS329\Proposal", "rb") as inputStream:
    pdfInput = PdfFileReader(inputStream)
    pdfOutput = PdfFileWriter()

    page1 = pdfInput.getPage(0)

    # createHighlight reads meta["author"] and meta["contents"]; a dict
    # without those keys raises KeyError. The zoning code typed by the user
    # is stored as the text of the annotation.
    highlight = createHighlight(100, 400, 400, 500, {
        "author": "",
        "contents": duncan_code,
    })

    addHighlightToPage(highlight, page1, pdfOutput)

    pdfOutput.addPage(page1)

    # The output must be opened for binary writing; open("output.pdf") alone
    # opens the file read-only in text mode.
    with open("output.pdf", "wb") as outputStream:
        pdfOutput.write(outputStream)

@wave-DmP
Copy link

Is there a way to search for TextStringObjects and use the matches to highlight them?

@ayoyu
Copy link

ayoyu commented Dec 4, 2018

Thanks for sharing 👍
I just want to know if there is a way to extract QuadPoints from certain regions in our pdf?

@BKaurHarpreet
Copy link

Could you please explain the code? Also, why did you import objects such as DictionaryObject?

@MartinThoma
Copy link

You should use writer.add_annotation(page_number=0, annotation=annotation) instead of addHighlightToPage(highlight, page1, pdfOutput)

I will likely soon add official support for highlight annotations to pypdf. See https://pypdf.readthedocs.io/en/latest/user/adding-pdf-annotations.html

@AdeelK93
Copy link

This is very helpful! I took it a bit further and used pdfminer to find the text that you're trying to highlight, optionally constraining the search space to a bounding box. PDF makes this incredibly complicated - you have to find the coordinates of every letter on the page, cluster them into bounding boxes line by line, and then highlight that polygon. Hope this code helps someone the way @agentcooper's helped me.

You can then take the result of highlight_annotation and pass that on to writer.add_annotation as @MartinThoma suggested

from pdfminer.high_level import extract_pages
from pdfminer.layout import LTChar, LTPage, LTTextContainer, LTTextLine
from pypdf import PdfReader, PdfWriter
from pypdf.generic import (ArrayObject, DictionaryObject, FloatObject, NameObject, NumberObject,
                           TextStringObject)


def is_within_bbox(bbox: list[float], constraint_bbox: list[float], margin = 10):
    """Tell whether ``bbox`` lies inside ``constraint_bbox``, give or take ``margin``.

    Both boxes are ``[x0, y0, x1, y1]`` sequences. Each edge of ``bbox`` may
    stick out of the constraint box by at most ``margin`` and still count as
    inside.

    Returns:
        True when all four edges satisfy the relaxed containment test,
        otherwise False.
    """
    left, bottom, right, top = bbox
    min_x, min_y, max_x, max_y = constraint_bbox

    # Lower-left corner first: it may undershoot the constraint by `margin`.
    lower_corner_ok = min_x <= left + margin and min_y <= bottom + margin
    if not lower_corner_ok:
        return False

    # Upper-right corner: it may overshoot the constraint by `margin`.
    return max_x >= right - margin and max_y >= top - margin

def extract_char_bboxes(page_layout: LTPage, constraint_bbox: list[float] | None = None):
    """Collect the characters of one page together with their bounding boxes.

    Only ``LTChar`` objects found inside ``LTTextLine`` objects of
    ``LTTextContainer`` elements are considered; everything else on the page
    is skipped.

    Args:
        page_layout: Page layout to walk.
        constraint_bbox: Optional ``[x0, y0, x1, y1]`` region. When given,
            characters for which ``is_within_bbox`` returns False are left out.

    Returns:
        A ``(text, char_bboxes)`` tuple in which ``char_bboxes[i]`` is the
        box of the glyph that produced ``text[i]``, so an index or slice into
        ``text`` can be applied to ``char_bboxes`` directly.
    """
    text_parts: list[str] = []
    char_bboxes: list[list[float]] = []

    for element in page_layout:
        if not isinstance(element, LTTextContainer):
            continue
        for text_line in element:
            if not isinstance(text_line, LTTextLine):
                continue
            for char_obj in text_line:
                if not isinstance(char_obj, LTChar):
                    continue
                if constraint_bbox is not None and not is_within_bbox(list(char_obj.bbox), constraint_bbox):
                    continue

                glyph_text = char_obj.get_text()
                text_parts.append(glyph_text)
                # get_text() may return more than one character for a single
                # glyph (for example a ligature) or an empty string. Emit one
                # box per character so text and boxes stay index-aligned; each
                # entry is its own list so callers can modify one safely.
                char_bboxes.extend(list(char_obj.bbox) for _ in glyph_text)

    # Join once at the end instead of growing a string inside the loop.
    return "".join(text_parts), char_bboxes


def find_text_bbox(pdf, target_text: str, constraint_bbox: list[float] | None = None) -> list[list[float]]:
    """Locate the first occurrence of ``target_text`` and return its boxes.

    Pages are scanned in the order ``extract_pages`` yields them and the
    search stops at the first page whose extracted text contains
    ``target_text``. The matched characters are grouped by the bottom
    coordinate (``y0``) of their boxes, and one enclosing box is produced
    per group.

    Args:
        pdf: Passed unchanged to ``extract_pages``.
        target_text: Text to search for.
        constraint_bbox: Optional ``[x0, y0, x1, y1]`` region, forwarded to
            ``extract_char_bboxes``.

    Returns:
        One ``[x0, y0, x1, y1]`` box per group, ordered by where each group
        first appears in the matched text.

    Raises:
        ValueError: If no page contains ``target_text``.
    """
    for page_layout in extract_pages(pdf):
        text, char_bboxes = extract_char_bboxes(page_layout, constraint_bbox)

        start_index = text.find(target_text)
        if start_index == -1:
            continue

        end_index = start_index + len(target_text)
        target_chars_bbox = char_bboxes[start_index:end_index]

        # Group the character boxes by y0. A dict keeps first-seen order, so
        # the boxes come out in reading order of the match instead of the
        # arbitrary order a set would give.
        chars_by_line: dict[float, list[list[float]]] = {}
        for char_bbox in target_chars_bbox:
            chars_by_line.setdefault(char_bbox[1], []).append(char_bbox)

        line_bboxes = []
        for line_chars in chars_by_line.values():
            # Transpose the boxes into four coordinate columns.
            x0, y0, x1, y1 = zip(*line_chars)
            line_bboxes.append([min(x0), min(y0), max(x1), max(y1)])
        return line_bboxes

    raise ValueError(f"Text '{target_text}' not found in the PDF.")

# x1, y1 starts in bottom left corner
def highlight_annotation(bounds: list[list[float]], author: str, contents: str, color=(1, 0, 0)):
    """Build one highlight annotation covering several boxes.

    Args:
        bounds: Non-empty list of ``[x0, y0, x1, y1]`` boxes, typically one
            per text line, with (x0, y0) the bottom-left corner.
        author: Written to /T.
        contents: Written to /Contents.
        color: Iterable of numbers written to /C. The default is an
            immutable tuple so it cannot be shared and mutated across calls.

    Returns:
        A DictionaryObject describing the annotation; it is not attached to
        any page here.

    Raises:
        ValueError: If ``bounds`` is empty.
    """
    if not bounds:
        raise ValueError("bounds must contain at least one [x0, y0, x1, y1] box.")

    # The rectangle that bounds the whole highlight
    all_x0, all_y0, all_x1, all_y1 = zip(*bounds)
    rect_bbox = [min(all_x0), min(all_y0), max(all_x1), max(all_y1)]

    # Quad points include corners for each line of highlight, listed as
    # top-left, top-right, bottom-left, bottom-right.
    quad_points = []
    for left, bottom, right, top in bounds:
        quad_points.extend([left, top, right, top, left, bottom, right, bottom])

    newHighlight = DictionaryObject({
        # Annotation flag value 4 is the "Print" flag in the PDF
        # specification (the "NoZoom" flag is 8).
        NameObject("/F"): NumberObject(4),
        NameObject("/Type"): NameObject("/Annot"),
        NameObject("/Subtype"): NameObject("/Highlight"),

        NameObject("/T"): TextStringObject(author),
        NameObject("/Contents"): TextStringObject(contents),

        NameObject("/C"): ArrayObject([FloatObject(c) for c in color]),
        NameObject("/Rect"): ArrayObject([FloatObject(c) for c in rect_bbox]),
        NameObject("/QuadPoints"): ArrayObject([FloatObject(c) for c in quad_points]),
    })

    return newHighlight

@rbehal
Copy link

rbehal commented Feb 18, 2024

Updated for new versions of PyPDF2 that don't support PdfFileReader and PdfFileWriter:

from PyPDF2 import PdfReader, PdfWriter
from PyPDF2.generic import (
    DictionaryObject,
    NumberObject,
    FloatObject,
    NameObject,
    TextStringObject,
    ArrayObject
)

def createHighlight(x1, y1, x2, y2, meta, color=(1, 0, 0)):
    """Build a PDF highlight annotation dictionary for a single rectangle.

    Args:
        x1, y1: First corner of the rectangle.
        x2, y2: Opposite corner of the rectangle.
        meta: Mapping that must provide the keys "author" (written to /T)
            and "contents" (written to /Contents).
        color: Iterable of numbers written to /C. The default is an
            immutable tuple so it cannot be shared and mutated across calls.

    Returns:
        A DictionaryObject describing the annotation. It is not attached to
        any page here; see addHighlightToPage.

    Raises:
        KeyError: If meta lacks "author" or "contents".
    """
    rect = (x1, y1, x2, y2)
    # One quadrilateral covering the rectangle: the two (x, y2) corners
    # first, then the two (x, y1) corners.
    quad_points = (x1, y2, x2, y2, x1, y1, x2, y1)

    newHighlight = DictionaryObject()

    newHighlight.update({
        # Annotation flag value 4 is the "Print" flag in the PDF specification.
        NameObject("/F"): NumberObject(4),
        NameObject("/Type"): NameObject("/Annot"),
        NameObject("/Subtype"): NameObject("/Highlight"),

        NameObject("/T"): TextStringObject(meta["author"]),
        NameObject("/Contents"): TextStringObject(meta["contents"]),

        NameObject("/C"): ArrayObject([FloatObject(c) for c in color]),
        NameObject("/Rect"): ArrayObject([FloatObject(v) for v in rect]),
        NameObject("/QuadPoints"): ArrayObject([FloatObject(v) for v in quad_points]),
    })

    return newHighlight

def addHighlightToPage(highlight, page, writer):
    """Attach a highlight annotation to a page's /Annots array.

    When ``writer`` exposes ``_add_object``, the annotation is registered
    with it first and the value that call returns is stored in /Annots, the
    same way the original version of this helper used the writer. If
    ``writer`` has no such method (for example ``None``), the annotation
    dictionary is stored directly, as before.

    NOTE(review): ``_add_object`` is a private writer method; pypdf also
    documents a public ``PdfWriter.add_annotation`` for this purpose -
    consider switching callers to it.

    Args:
        highlight: Annotation dictionary, e.g. from createHighlight.
        page: Page object that receives the annotation.
        writer: Writer that will own the annotation object.
    """
    add_object = getattr(writer, "_add_object", None)
    annotation = add_object(highlight) if callable(add_object) else highlight

    # Add the highlight annotation to the specified page
    if "/Annots" in page:
        page["/Annots"].append(annotation)
    else:
        page[NameObject("/Annots")] = ArrayObject([annotation])

@MartinThoma
Copy link

MartinThoma commented Feb 22, 2024

@rbehal You should no longer use PyPDF2 as it's deprecated. Use pypdf. It supports highlights out of the box: https://pypdf.readthedocs.io/en/latest/user/adding-pdf-annotations.html#highlighting

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment