Instantly share code, notes, and snippets.

Embed
What would you like to do?
PDF comment extraction with Python and PDFMiner
import sys

from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdftypes import PDFObjectNotFound
# Object ids of every page dictionary seen so far, in the order they were
# passed to extract(). Used to turn an annotation's parent-page reference
# into a 1-based page ordinal.
pages = []


def extract(objid, obj):
    """Record page objects and print the details of comment-like objects.

    Non-dict objects are ignored. A dict whose ``Type`` entry has the name
    ``Page`` gets its ``objid`` appended to the module-level ``pages`` list.
    Any other dict that has a ``C`` key is treated as a comment and printed
    as ``objid page Subj T Contents``.

    ``page`` is the 1-based position of the parent page (the ``P`` entry's
    ``objid``) in ``pages``, or ``-1`` when the parent is missing or has not
    been registered yet. Because the position reflects the order in which
    page objects were fed to this function, it only matches the document's
    page number if pages are visited in page order.
    NOTE(review): confirm that ordering against the caller.

    Missing ``Subj``/``T``/``Contents`` entries are printed as ``None``
    instead of raising ``KeyError``.

    Args:
        objid: Identifier of the PDF object being inspected.
        obj: The resolved PDF object; only ``dict`` instances are examined.
    """
    if not isinstance(obj, dict):
        return

    # The 'Type' value is expected to expose a `.name` attribute; values
    # without one (or a missing 'Type') are simply not pages.
    type_name = getattr(obj.get('Type'), 'name', None)
    if type_name == 'Page':
        pages.append(objid)
    elif 'C' in obj:
        # 'P' is expected to be a reference carrying the parent's `objid`.
        parent_id = getattr(obj.get('P'), 'objid', None)
        try:
            page_number = pages.index(parent_id) + 1
        except ValueError:
            # Parent page unknown (absent, or not seen yet).
            page_number = -1
        print(objid, page_number,
              obj.get('Subj'), obj.get('T'), obj.get('Contents'))
# Walk every object listed in the document's cross-reference tables and hand
# each one to extract(). The file must stay open for the whole walk because
# objects are fetched from it on demand via doc.getobj().
with open("y.pdf", 'rb') as fp:
    parser = PDFParser(fp)
    doc = PDFDocument(parser, "")
    # The same object id can be listed by more than one xref table; process
    # each id only once.
    visited = set()
    for xref in doc.xrefs:
        for objid in xref.get_objids():
            if objid in visited:
                continue
            visited.add(objid)
            try:
                obj = doc.getobj(objid)
            except PDFObjectNotFound as e:
                # Report the dangling id and keep going with the rest.
                print('not found: %r' % e, file=sys.stderr)
                continue
            if obj is None:
                continue
            extract(objid, obj)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment