Skip to content

Instantly share code, notes, and snippets.

@kvgc
Last active June 8, 2022 07:42
Show Gist options
  • Save kvgc/704c9af2ab91be19905fb5b31dad551a to your computer and use it in GitHub Desktop.
Save kvgc/704c9af2ab91be19905fb5b31dad551a to your computer and use it in GitHub Desktop.
### Usage: python exportAnnotations.py test.pdf
### Outputs notes and highlights in the pdf as html
### Requires: markdown2, PyPDF2, pdfannots
import html
import re
import subprocess
import sys
import tempfile
from pathlib import Path

import markdown2
from PyPDF2 import PdfFileReader
# filename = "downloads_firefox/2204.09332.pdf"
filename = str(sys.argv[1])
reader = PdfFileReader(filename)
##print(reader.getPage(0).mediaBox)
finalNote = '<h2> Notes </h2><br>'
pageNum=0
for page in reader.pages:
pageNum+=1
# print(pageNum)
if "/Annots" in page:
for annot in page["/Annots"]:
subtype = annot.get_object()["/Subtype"]
if subtype == "/Text":
# print(annot.get_object()["/Contents"])
# print('''<a href="''' + filename + '''#page=''' + str(pageNum) + '''"></a>''')
finalNote+='''Page:'''+ str(pageNum)+''' <a href="''' + filename + '''#page=''' + str(pageNum) + '''">''' + annot.get_object()["/Contents"] +'''</a><br>'''
if subtype == "/FreeText":
finalNote+='''Page:'''+ str(pageNum)+''' <a href="''' + filename + '''#page=''' + str(pageNum) + '''">''' + annot.get_object()["/Contents"] +'''</a><br>'''
## use pdfannots to extract the highlights
subprocess.call(['pdfannots', filename, '-s','highlights', '-o', 'pdfAnnots_foo.txt'])
txt = Path('pdfAnnots_foo.txt').read_text()
# print(txt)
## replace all the page 1, page 3 , etc from output to be urls instead. Assume maximum of 1000 pages
## Has to be reversed otherwise Page 100 might be mistaken for Page 10 or Page 1
for i in reversed(range(1000)):
string = "Page %d"%(i+1)
string_updated =''' <a href="''' + filename + '''#page=''' + '''%d">Page-%d'''%(i+1,i+1) + '''</a><br>'''
txt = txt.replace(string, string_updated)
# print( markdown2.markdown(txt))
finalNote += "<br>"+markdown2.markdown(txt)
print(finalNote)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment