Skip to content

Instantly share code, notes, and snippets.

@v-sukt
Forked from Samathy/dumppdfcomments.py
Last active February 29, 2024 06:44
Show Gist options
  • Star 2 You must be signed in to star a gist
  • Fork 2 You must be signed in to fork a gist
  • Save v-sukt/a59f549864d8369162a237e608251b07 to your computer and use it in GitHub Desktop.
Save v-sukt/a59f549864d8369162a237e608251b07 to your computer and use it in GitHub Desktop.
Python script to extract highlighted text, images (square/rectangle annotations, e.g. a table you highlight with a box), and text annotations from PDFs. Uses python-poppler-qt5 and PyQt5. Adapted from https://stackoverflow.com/questions/21050551/extracting-text-from-higlighted-text-using-poppler-qt4-python-poppler-qt4 with some minor modifications.
import popplerqt5
import sys
import PyQt5
# Rendering resolution handed to renderToImage (for both axes) when a boxed
# region is saved as an image; page dimensions are multiplied by
# resolution/72 so the crop rectangle matches the rendered image.
resolution = 150
def _highlighted_text(page, annotation, pwidth, pheight):
    """Return the page text under every quad of a highlight annotation.

    Quad corner coordinates are multiplied by the page size
    (``pwidth`` x ``pheight``) before being handed to ``page.text``.
    Each quad's text is followed by one space, so the result ends with a
    trailing space whenever the annotation has at least one quad.
    """
    pieces = []
    for quad in annotation.highlightQuads():
        # Points 0 and 2 are used as the two opposite corners of the quad.
        area = PyQt5.QtCore.QRectF()
        area.setCoords(
            quad.points[0].x() * pwidth,
            quad.points[0].y() * pheight,
            quad.points[2].x() * pwidth,
            quad.points[2].y() * pheight,
        )
        pieces.append(str(page.text(area)) + " ")
    return "".join(pieces)


def _save_boxed_region(page, annotation, pwidth, pheight, filename):
    """Render the area inside a geometric annotation and save it to *filename*."""
    bounds = annotation.boundary()
    # The page size is expressed for 72-per-inch rendering, so scale it to
    # the resolution the image is actually rendered at.
    width = pwidth * resolution / 72
    height = pheight * resolution / 72
    # renderToImage takes integer x/y/w/h; newer Python/PyQt5 combinations
    # reject floats instead of truncating them, so truncate explicitly.
    image = page.renderToImage(
        resolution,
        resolution,
        int(bounds.left() * width),
        int(bounds.top() * height),
        int(bounds.width() * width),
        int(bounds.height() * height),
    )
    image.save(filename)


def main():
    """Print the annotations of the PDF named by the first command-line argument.

    For each page, in order:

    * highlight annotations: the highlighted text is printed, followed by the
      annotation's note (tab-indented) when it has one;
    * geometric (box) annotations: the boxed region is rendered to
      ``page{P}_image{N}.png`` in the current directory, where ``P`` is the
      zero-based page index and ``N`` counts boxes on that page from 1; the
      file name and the note (if any) are printed;
    * text annotations: the note is printed when it is non-empty.

    Prints ``no annotations found`` when the document has none.

    Exits with an error message (status 1) when no file argument is given or
    the document cannot be loaded.
    """
    if len(sys.argv) < 2:
        sys.exit("usage: {} FILE.pdf".format(sys.argv[0]))

    doc = popplerqt5.Poppler.Document.load(sys.argv[1])
    if doc is None:
        # load() yields no document for a missing or unreadable file.
        sys.exit("could not open PDF: {}".format(sys.argv[1]))

    total_annotations = 0
    for i in range(doc.numPages()):
        page = doc.page(i)
        size = page.pageSize()
        pwidth, pheight = size.width(), size.height()
        count = 0  # boxed regions saved so far on this page

        for annotation in page.annotations():
            if not isinstance(annotation, popplerqt5.Poppler.Annotation):
                continue
            total_annotations += 1
            note = annotation.contents()

            if isinstance(annotation, popplerqt5.Poppler.HighlightAnnotation):
                print(_highlighted_text(page, annotation, pwidth, pheight))
                if note:
                    print("\t - {}".format(note))
            elif isinstance(annotation, popplerqt5.Poppler.GeomAnnotation):
                count += 1
                filename = "page{}_image{}.png".format(i, count)
                _save_boxed_region(page, annotation, pwidth, pheight, filename)
                print(filename)
                if note:
                    print(note)
            elif isinstance(annotation, popplerqt5.Poppler.TextAnnotation):
                if note:
                    print(note)

    if not total_annotations:
        print("no annotations found")
# Run the extraction only when this file is executed as a script, so that
# importing it does not read sys.argv or touch any PDF.
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment