Skip to content

Instantly share code, notes, and snippets.

@phreeza
Created August 17, 2016 11:12
Show Gist options
  • Star 2 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save phreeza/d2d1f20ecff5d7768e119bf6ec17a966 to your computer and use it in GitHub Desktop.
Save phreeza/d2d1f20ecff5d7768e119bf6ec17a966 to your computer and use it in GitHub Desktop.
Extract the comments (annotations) from a PDF file and dump them into a Markdown file for pretty formatting.
# adapted from http://stackoverflow.com/a/12502560/379300
# Output should be valid markdown, so it can be turned into a nice pdf with pandoc
import poppler
import sys
import urllib
import os
def main():
    """Print the text annotations of a PDF as a Markdown document.

    The PDF path is taken from ``sys.argv[1]``.  For every page that has at
    least one annotation with string contents, a heading ("Page N"
    underlined with "=====") is printed, followed by one "- " bullet per
    annotation and a blank line.  Link annotations are ignored.  A final
    line reports the total number of annotations that were printed.

    Raises:
        SystemExit: with a usage message if no PDF path was given.
    """
    if len(sys.argv) < 2:
        # Fail with a readable message instead of an IndexError traceback.
        sys.exit("usage: %s <file.pdf>" % sys.argv[0])
    input_filename = sys.argv[1]

    # The document is opened through a file:// URI built from the absolute
    # path; see http://blog.hartwork.org/?p=612
    uri = 'file://%s' % urllib.pathname2url(os.path.abspath(input_filename))
    document = poppler.document_new_from_file(uri, None)

    # One (zero-based page index, [comment, ...]) pair per commented page.
    commented_pages = []
    for page_index in range(document.get_n_pages()):
        page_comments = []
        for annot_mapping in document.get_page(page_index).get_annot_mapping():
            annot = annot_mapping.annot
            if annot.get_annot_type().value_name == 'POPPLER_ANNOT_LINK':
                continue
            contents = annot.get_contents()
            # Only string contents are kept; anything else (presumably None
            # for annotations without text) is skipped.
            if isinstance(contents, str):
                page_comments.append(contents)
        if page_comments:
            commented_pages.append((page_index, page_comments))

    # Single-argument, parenthesised prints behave the same under the
    # Python 2 print statement this script was written for.
    all_annots = 0
    for page_index, page_comments in commented_pages:
        all_annots += len(page_comments)
        print("Page " + str(page_index + 1))
        print("=====")
        for comment in page_comments:
            print('- ' + comment)
        # Blank line so the bullet list is closed before the next heading.
        print("")

    if all_annots > 0:
        print(str(all_annots) + " annotation(s) found")
    else:
        print("no annotations found")
# Run the extraction only when this file is executed as a script, so that
# importing it has no side effects.
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment