Skip to content

Instantly share code, notes, and snippets.

@runsun
Last active March 10, 2020 16:31
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save runsun/9bd4a516269c185ab9a9c5ef2ed26a3d to your computer and use it in GitHub Desktop.
Save runsun/9bd4a516269c185ab9a9c5ef2ed26a3d to your computer and use it in GitHub Desktop.
Process Pdf Bookmarks Extracted by JPdfBookmarks
######################################################################
## process_pdf_bookmarks.py
## by Runsun Pan, 20190310
##
## Process pdf bookmarks extracted from a pdf file by JPdfBookmarks
## JPdfBookmarks version: 2.5.2
#
## The bookmark file dumped by JPdfBookmarks has lots of styling
## codes that shouldn't be there. This py takes care of that. Other
## features included.
##
## Features:
##
## 1. Remove styling codes of a dumped pdf bookmark file
## 2. Allows for output of outline at customized level
## 3. Can adjust indent width
## 4. Can adjust the line spacing above and below header
##
##
## How:
## 1. Download and install JPdfBookmarks
## https://sourceforge.net/projects/jpdfbookmarks/
## 2. Use JPdfBookmarks to open pdf file
## 3. Find the button to dump the bookmarks to a txt file
## 4. Go to the folder where the txt file is saved and run:
##
## > py process_pdf_bookmarks.py -f=<filename> <options>
##
## Where <options> are:
##
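## -l: level of outline. Values below 1 keep every level (the "details" output);
##     n >= 1 keeps only entries nested fewer than n levels deep (1 = top level only).
##     Default: -1.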
## -s: line spacing above,below the header. Default: "1,1"
## -i: number of spaces for each level of indent. Default: 2
## -t: title. Can span several lines: separate them with a literal backslash followed by "n"
##     (the two characters \n typed on the command line). Default: "Pdf Bookmarks"
##
## An input file abc.txt produces one of the following:
##
## abc_details.txt
## abc_outlined_n.txt where n = level
##
######################################################################
# Module metadata; reused below for the argparse description and the --version output.
__description__ = "Process pdf bookmarks extracted from a pdf file by JPdfBookmarks"
__author__= "Runsun Pan"
__version__= "200306-1"
import argparse
#======================================
# argument setup and parse
#
# Command-line interface.  Every option is copied into a module-level name
# (fname, indent, level, title, headerLineSpacing) that the functions below read.
argp = argparse.ArgumentParser(
    prog=__file__,
    description=__description__ + "\n" + __author__,
)
argp.add_argument(
    "--version",
    action="version",
    version="%(prog)s " + __version__,
    help="Shows version and exits.",
)
# -f is mandatory: without it the script would only fail later, with an
# opaque TypeError raised by open(None).
argp.add_argument("-f", metavar="fname", type=str, required=True, help="file name")
argp.add_argument("-i", metavar="indent", type=int, default=2,
                  help="Optional indent. Default=2")
# Any level below 1 (including the default -1) keeps every bookmark; level 1
# is the first value that actually restricts the output to the top level.
argp.add_argument("-l", metavar="level", type=int, default=-1,
                  help="Optional # to indicate outline level. 1= top level only; "
                       "less than 1= all levels. Default=-1")
argp.add_argument("-t", metavar="title", type=str, default="Pdf Bookmarks",
                  help="Optional title for the output file. Default:'Pdf Bookmarks'")
argp.add_argument("-s", metavar="headerLineSpacing", type=str, default="1,1",
                  help="Optional header line spacing in the form of 'm,n'. Default: '1,1'")

args = argp.parse_args()
fname = args.f
indent = args.i
level = args.l
title = args.t
headerLineSpacing = args.s
#
#======================================
#
# Read the whole bookmark dump up front.  The context manager closes the
# file handle as soon as the lines are in memory.
with open(fname, "r") as bookmark_file:
    lines = bookmark_file.readlines()
def removeStyles(lines=None):
    """Remove the styling codes that follow the page reference on each line.

    Everything after the last "/" of a line is treated as
    "<page>,<style codes...>" and only the part before the first comma is
    kept (assumes the JPdfBookmarks dump layout "title/page,style,..." --
    TODO confirm against the JPdfBookmarks documentation).

    Args:
        lines: iterable of raw bookmark lines.  When omitted, the
            module-level ``lines`` read from the input file is used.

    Returns:
        A new list of lines without styling codes and without line endings.
    """
    if lines is None:
        # Looked up at call time; the parameter shadows the module-level
        # name, hence the explicit globals() access.
        lines = globals()["lines"]

    cleaned = []
    for line in lines:
        # Drop the line ending first so that a line without any style code
        # does not keep its newline (replaceTabs appends its own).
        line = line.rstrip("\r\n")
        head, slash, tail = line.rpartition("/")
        if not slash:
            # No page reference at all: leave the text alone rather than
            # inventing a "/" prefix and cutting the title at a comma.
            cleaned.append(line)
            continue
        cleaned.append(head + slash + tail.split(",")[0])
    return cleaned
def replaceTabs(lines, width=None, spacing=None):
    """Replace each tab with spaces and pad top-level lines with blank lines.

    Args:
        lines: bookmark lines without trailing newlines.
        width: number of spaces substituted for every tab.  Defaults to the
            module-level ``indent`` setting.
        spacing: "m,n" string giving the number of blank lines added above
            and below each top-level line.  Defaults to the module-level
            ``headerLineSpacing`` setting.

    Returns:
        A new list in which every line ends with a newline.

    Raises:
        ValueError: if ``spacing`` is not two comma-separated integers.
    """
    if width is None:
        width = indent
    if spacing is None:
        spacing = headerLineSpacing

    try:
        above, below = (int(part) for part in spacing.split(","))
    except ValueError:
        raise ValueError(
            "header line spacing must look like 'm,n', got %r" % (spacing,)
        ) from None

    pad = " " * width
    result = []
    for line in lines:
        text = line.replace("\t", pad) + "\n"
        # Decide nesting from the ORIGINAL line: after expansion with
        # width == 0 the tabs are gone and every line would look top-level.
        if line.startswith(("\t", " ")):
            result.append(text)
        else:
            result.append("\n" * above + text + "\n" * below)
    return result
def getLines(lines):
    """Return the cleaned bookmark lines, optionally cut to an outline.

    Styling codes are removed and tabs are expanded for every line.  When
    the module-level ``level`` is 1 or more, only bookmarks nested fewer
    than ``level`` levels deep are kept; any smaller value keeps them all.

    Args:
        lines: raw lines of the bookmark dump.

    Returns:
        A list of output lines, each ending with a newline.
    """
    cleaned = removeStyles(lines)
    if level >= 1:
        # Measure depth by leading tabs BEFORE they are expanded: a prefix
        # test on the expanded spaces matches every line when indent is 0.
        # Assumes the dump indents nested bookmarks with tabs -- TODO confirm.
        cleaned = [
            line for line in cleaned
            if len(line) - len(line.lstrip("\t")) < level
        ]
    return replaceTabs(cleaned)
def save(lines=getLines(lines), title=title):
    """Write the title block followed by the bookmark lines to a new file.

    The output name is the input name with "_details" (level below 1) or
    "_outlined_<level>" inserted before the extension.  A summary of the
    file name and the parameters used is printed afterwards.

    Args:
        lines: list of output lines.  The default is computed once, when
            this ``def`` statement runs, from the module-level ``lines``.
        title: heading text; a literal backslash-n sequence separates
            title lines.  Defaults to the module-level ``title``.
    """
    import os.path  # local import keeps this block self-contained

    #=============================== Prepare output title
    title_lines = ["\n" + part for part in title.split("\\n")]

    # Attach 'details' or 'outlined_n' to the file name.  splitext copes
    # with names that have no extension and with dots in directory names,
    # both of which a plain split('.') mangles.
    suffix = "outlined_" + str(level) if level >= 1 else "details"
    root, ext = os.path.splitext(fname)
    fn = root + "_" + suffix + ext
    title_lines.append("\n" + fn)

    # Add a separator line "===..." underneath the title; the -1 discounts
    # the leading newline carried by every title entry.
    width = max(len(entry) for entry in title_lines) - 1
    title_lines.append("\n" + "=" * width + "\n")
    #===============================

    with open(fn, "w") as out:
        out.writelines(title_lines + list(lines))

    print('### Save to file: ' + fn
          + '\n### parameters: '
          + str({"level": level, 'indent': indent,
                 'headerLineSpacing': headerLineSpacing}))


save()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment