Skip to content

Instantly share code, notes, and snippets.

@kidwellj
Forked from retrography/annotex.py
Created January 17, 2020 15:11
Show Gist options
  • Save kidwellj/b3b21baaadda67e2dad70ec7fc8745be to your computer and use it in GitHub Desktop.
Save kidwellj/b3b21baaadda67e2dad70ec7fc8745be to your computer and use it in GitHub Desktop.
PDF highlight and annotation extractor
#!/usr/bin/env python
__author__ = 'Mahmood S. Zargar'
import poppler
import sys
import urllib
import os
# Annotation kinds that are reported, spelled as the capitalised form of the
# type's ``value_nick`` (see main()); every other kind is skipped.
_REPORTED_ANNOT_TYPES = ('Underline', 'Highlight', 'Strike-out', 'Squiggly',
                         'Text', 'Free-text', 'Caret')


def _page_prompt(page_no, page_label):
    """Return the '[p. N](label): ' prefix, plus newline, for one annotation.

    The '(label)' part is added only when the page label differs from the
    1-based page number string.
    """
    prompt = '[p. ' + page_no + ']'
    # A missing label is treated the same as "no distinct label" instead of
    # failing on the string concatenation.
    if page_label is not None and page_no != page_label:
        prompt += '(' + page_label + ')'
    return prompt + ': \n'


def _selection_area(annot_area, page_height):
    """Return a copy of annot_area with its corners swapped and its y axis flipped.

    NOTE(review): presumably this converts the annotation rectangle into the
    coordinate system expected by ``page.get_selected_text`` -- confirm
    against the Poppler documentation.
    """
    area = annot_area.copy()
    area.x1, area.x2 = area.x2, area.x1
    area.y1, area.y2 = area.y2, area.y1
    area.y1 = page_height - area.y1
    area.y2 = page_height - area.y2
    return area


def main():
    """Print the annotated text and notes of every PDF named on the command line.

    For each file the base name, the document title (when set) and one entry
    per reported annotation are written to standard output, followed by the
    number of reported annotations in that file.  An annotation can produce a
    'Text' entry (the page text selected under its area) and/or a 'Note'
    entry (the annotation's own contents).

    Exits with status 1 after printing a usage message when no input file is
    given.

    Every ``print`` is called with exactly one parenthesised string, so the
    output is the same under the Python 2 print statement this script was
    written for.
    """
    if len(sys.argv) < 2:
        print('Input file required. Please mention at least one.')
        print('Syntax: annotex input_file1.pdf [input_file2.pdf ...]')
        sys.exit(1)

    for file_name in sys.argv[1:]:
        print('')
        # The document is opened through a file:// URI built from the
        # absolute path.
        uri = 'file://%s' % urllib.pathname2url(os.path.abspath(file_name))
        document = poppler.document_new_from_file(uri, None)
        print(os.path.basename(file_name))
        doc_title = document.get_property('title')
        if doc_title:
            print('(' + doc_title + ')')
        print('-----\n')

        all_annots = 0
        for i in range(document.get_n_pages()):
            page = document.get_page(i)
            for annot_mapping in page.get_annot_mapping():
                annot = annot_mapping.annot
                nick = annot.get_annot_type().value_nick
                # Slicing instead of indexing keeps an empty nick from
                # raising IndexError.
                annot_type = nick[:1].upper() + nick[1:]
                if annot_type not in _REPORTED_ANNOT_TYPES:
                    continue
                all_annots += 1

                page_no = str(page.get_index() + 1)
                page_label = page.props.label
                page_prompt = _page_prompt(page_no, page_label)

                _, height = page.get_size()
                area = _selection_area(annot_mapping.area, height)
                selected = page.get_selected_text("POPPLER_SELECTION_WORD",
                                                  area)
                # Check for None *before* stripping; stripping first would
                # raise AttributeError when no text comes back.
                annot_text = selected.strip() if selected is not None else ''
                annot_cont = annot.get_contents()

                # Selections that are empty or consist only of the page
                # number / page label are not reported as annotated text.
                if annot_text not in ('', page_no, page_label):
                    print(annot_type + ' Text ' + page_prompt + annot_text + '\n')
                if annot_cont:
                    print(annot_type + ' Note ' + page_prompt + annot_cont + '\n')

        # Per-file summary: all_annots is reset for every input file.
        print('-----\n' + str(all_annots) + " annotation(s) found" + '\n\n')
# Script entry point: run the extractor only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment