lizettepreiss/FDFParser.py

## FDFParser.py

from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdftypes import resolve1

# I exported the comments I had made in an Adobe Reader DC document to F:/temp/stn.fdf.
# I then wanted to access those comments outside of Adobe Reader; this is how I extracted them.

def _decode_pdf_text(raw):
    """Return the value of a PDF string entry as ``str`` without raising.

    Args:
        raw: The value read from the annotation dictionary. Expected to be
            ``bytes``; anything else is passed through ``str()``.

    Returns:
        The decoded text. Bytes starting with the UTF-16BE byte-order mark
        (``FE FF``) are decoded as UTF-16BE; otherwise UTF-8 is tried first
        and Latin-1 is used as a fallback that cannot fail.
    """
    if not isinstance(raw, bytes):
        return str(raw)
    if raw.startswith(b"\xfe\xff"):
        # A leading FE FF byte-order mark is never valid UTF-8, so a plain
        # .decode() would always fail on these strings.
        return raw[2:].decode("utf-16-be", errors="replace")
    try:
        return raw.decode()
    except UnicodeDecodeError:
        # Latin-1 maps every byte to a character, so this always succeeds.
        # NOTE(review): this only approximates PDFDocEncoding - confirm if
        # exact rendering of unusual characters matters.
        return raw.decode("latin-1")


# The 'with' block guarantees the file is closed even if parsing raises.
# All reads from 'doc' stay inside the block because the file must still be
# open while the objects are being resolved.
with open("F:/temp/stn.fdf", 'rb') as fdf_file:
    parser = PDFParser(fdf_file)
    doc = PDFDocument(parser)

    # Note re the next line in this code:
    # You might need to put a breakpoint here and look at the 'doc.catalog' variable to see what
    # the catalog keys are, and replace the ones below with yours. 'FDF' and 'Annots' were what they
    # were in my case, but I've seen other example source code online of how to parse FDF files that
    # used different catalog keys. I have no idea whether the catalog keys vary between different
    # PDF readers or even between versions.
    # The fdf file I used in this example originated from exporting the comments I made in a PDF
    # when using Adobe Reader DC version 2020.006.20042
    fields = resolve1(doc.catalog['FDF'])['Annots']

    for i in fields:
        field = resolve1(i)

        # Note re the next line in this code:
        # You might need to put a breakpoint here and look at the 'field' variable to see what
        # the field names are that you want to extract. 'Page' and 'Contents' were what they were in
        # my case, but I've seen other example source code online showing how to parse FDF files that
        # used different field names. I have no idea whether the field names vary between different
        # PDF readers or even between versions.
        page, b_contents = field.get('Page'), field.get('Contents')

        # Annotations without a 'Contents' entry carry no comment text, so skip them.
        if b_contents is not None:
            # Decoding can no longer fail silently: previously a failed decode left
            # 'contents' unbound or holding the previous annotation's text.
            contents = _decode_pdf_text(b_contents)

            print("The page number where the comment was made is " + str(page))
            print("The contents of your comment is " + contents)

# I then wrote these values to a .xlsx file so that I could use them elsewhere
# (Writing to .xlsx will follow in another gist in due course).

	from pdfminer.pdfparser import PDFParser
	from pdfminer.pdfdocument import PDFDocument
	from pdfminer.pdftypes import resolve1

	# I exported the comments I had made in an Adobe Reader DC document to F:/temp/stn.fdf.
	# I then wanted to access those comments outside of Adobe Reader; this is how I extracted them.

	def _decode_pdf_text(raw):
		"""Return the value of a PDF string entry as ``str`` without raising.

		Args:
			raw: The value read from the annotation dictionary. Expected to be
				``bytes``; anything else is passed through ``str()``.

		Returns:
			The decoded text. Bytes starting with the UTF-16BE byte-order mark
			(``FE FF``) are decoded as UTF-16BE; otherwise UTF-8 is tried first
			and Latin-1 is used as a fallback that cannot fail.
		"""
		if not isinstance(raw, bytes):
			return str(raw)
		if raw.startswith(b"\xfe\xff"):
			# A leading FE FF byte-order mark is never valid UTF-8, so a plain
			# .decode() would always fail on these strings.
			return raw[2:].decode("utf-16-be", errors="replace")
		try:
			return raw.decode()
		except UnicodeDecodeError:
			# Latin-1 maps every byte to a character, so this always succeeds.
			# NOTE(review): this only approximates PDFDocEncoding - confirm if
			# exact rendering of unusual characters matters.
			return raw.decode("latin-1")


	# The 'with' block guarantees the file is closed even if parsing raises.
	# All reads from 'doc' stay inside the block because the file must still be
	# open while the objects are being resolved.
	with open("F:/temp/stn.fdf", 'rb') as fdf_file:
		parser = PDFParser(fdf_file)
		doc = PDFDocument(parser)

		# Note re the next line in this code:
		# You might need to put a breakpoint here and look at the 'doc.catalog' variable to see what
		# the catalog keys are, and replace the ones below with yours. 'FDF' and 'Annots' were what they
		# were in my case, but I've seen other example source code online of how to parse FDF files that
		# used different catalog keys. I have no idea whether the catalog keys vary between different
		# PDF readers or even between versions.
		# The fdf file I used in this example originated from exporting the comments I made in a PDF
		# when using Adobe Reader DC version 2020.006.20042
		fields = resolve1(doc.catalog['FDF'])['Annots']

		for i in fields:
			field = resolve1(i)

			# Note re the next line in this code:
			# You might need to put a breakpoint here and look at the 'field' variable to see what
			# the field names are that you want to extract. 'Page' and 'Contents' were what they were in
			# my case, but I've seen other example source code online showing how to parse FDF files that
			# used different field names. I have no idea whether the field names vary between different
			# PDF readers or even between versions.
			page, b_contents = field.get('Page'), field.get('Contents')

			# Annotations without a 'Contents' entry carry no comment text, so skip them.
			if b_contents is not None:
				# Decoding can no longer fail silently: previously a failed decode left
				# 'contents' unbound or holding the previous annotation's text.
				contents = _decode_pdf_text(b_contents)

				print("The page number where the comment was made is " + str(page))
				print("The contents of your comment is " + contents)

	# I then wrote these values to a .xlsx file so that I could use them elsewhere
	# (Writing to .xlsx will follow in another gist in due course).