Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
PDF Comment extraction with Python and PDFMiner
import sys

from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdftypes import PDFObjectNotFound
# Object ids of every page dictionary seen so far, in the order they were
# passed to extract(). A page's "number" is its 1-based position in this list.
pages = []


def extract(objid, obj):
    """Record page objects and print the fields of comment-like objects.

    Non-dict objects are ignored. A dict whose 'Type' entry has a ``name``
    of 'Page' has its object id appended to the module-level ``pages`` list.
    Any other dict that has a 'C' key is printed as one line containing:
    object id, page number, 'Subj', 'T' and 'Contents'.

    The page number is the 1-based index of the object referenced by 'P'
    within ``pages``, or -1 when that object has not been recorded (yet).
    Because ``pages`` is filled in visiting order, the number reflects the
    order in which page objects were encountered, and a comment visited
    before its page is reported with -1.

    Args:
        objid: Identifier of the object being inspected.
        obj: The resolved object; only dicts are examined.

    Missing 'P', 'Subj', 'T' or 'Contents' entries are reported as ``None``
    instead of raising, so a single incomplete dict does not abort the run.
    """
    if not isinstance(obj, dict):
        return

    # 'Type' is expected to be a name-like object exposing ``.name``;
    # anything else (missing key, unresolved reference) is simply not a page.
    if getattr(obj.get('Type'), 'name', None) == 'Page':
        pages.append(objid)
    elif 'C' in obj:
        # NOTE(review): 'C' presumably marks annotation dictionaries and 'P'
        # the reference to the owning page -- confirm against the PDF spec.
        parent = obj.get('P')
        try:
            pi = pages.index(getattr(parent, 'objid', None)) + 1
        except ValueError:
            # Parent page unknown, or not visited before this object.
            pi = -1
        print(objid, pi, obj.get('Subj'), obj.get('T'), obj.get('Contents'))
# Driver: open y.pdf, walk every object id listed in the document's xref
# tables exactly once, and hand each resolved object to extract().
# The file is opened with a context manager so the handle is always closed.
with open("y.pdf", 'rb') as fp:
    parser = PDFParser(fp)
    doc = PDFDocument(parser, "")
    # The same object id can be listed by more than one xref table.
    visited = set()
    for xref in doc.xrefs:
        for objid in xref.get_objids():
            if objid in visited:
                continue
            visited.add(objid)
            try:
                obj = doc.getobj(objid)
                if obj is None:
                    continue
                extract(objid, obj)
            except PDFObjectNotFound as e:
                # Report the dangling id on stderr and keep going.
                sys.stderr.write('not found: %r\n' % (e,))
@babytiger0929

This comment has been minimized.

Copy link

babytiger0929 commented Oct 17, 2019

hi!
How to get the comment data of pdf files with python?
If you know that please help me.
Thank you!

@ckolumbus

This comment has been minimized.

Copy link
Owner Author

ckolumbus commented Nov 13, 2019

@babytiger0929 Sorry, I don't know this off the top of my head.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
You can’t perform that action at this time.