Last active
March 10, 2020 16:31
-
-
Save runsun/9bd4a516269c185ab9a9c5ef2ed26a3d to your computer and use it in GitHub Desktop.
Process Pdf Bookmarks Extracted by JPdfBookmarks
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
###################################################################### | |
## process_pdf_bookmarks.py | |
## by Runsun Pan, 20190310 | |
## | |
## Process pdf bookmarks extracted from a pdf file by JPdfBookmarks | |
## JPdfBookmarks version: 2.5.2 | |
# | |
## The bookmark file dumped by JPdfBookmarks has lots of styling | |
## codes that shouldn't be there. This py takes care of that. Other | |
## features included. | |
## | |
## Features: | |
## | |
## 1. Remove styling codes of a dumped pdf bookmark file | |
## 2. Allows for output of outline at customized level | |
## 3. Can adjust indent width | |
## 4. Can adjust the line spacing above and below header | |
## | |
## | |
## How: | |
## 1. Download and install JPdfBookmarks | |
## https://sourceforge.net/projects/jpdfbookmarks/ | |
## 2. Use JPdfBookmarks to open pdf file | |
## 3. Find the button to dump the bookmarks to a txt file | |
## 4. Go to the folder where the txt file is saved and run:
## | |
## > py process_pdf_bookmarks.py -f=<filename> <options> | |
## | |
## Where <options> are: | |
## | |
## -l: number of outline levels to keep (1 = top level only). Values below 1 keep every level. Default: -1.
## -s: line spacing above,below the header. Default: "1,1" | |
## -i: number of spaces for each level of indent. Default: 2 | |
## -t: title. Could be multiline if lines are separated by "\\n". Default: "Pdf Bookmarks" | |
## | |
## An input file abc.txt produces one of the following:
## | |
## abc_details.txt | |
## abc_outlined_n.txt where n = level | |
## | |
###################################################################### | |
__description__ = "Process pdf bookmarks extracted from a pdf file by JPdfBookmarks"
__author__ = "Runsun Pan"
__version__ = "200306-1"

import argparse

#======================================
# argument setup and parse
#
argp = argparse.ArgumentParser(
    prog=__file__,
    description=__description__ + "\n" + __author__)
argp.add_argument('--version', action='version',
                  version='%(prog)s ' + __version__,
                  help='Shows version and exits.')
# -f is mandatory: the script opens this file right after parsing, so a
# missing value should produce a usage error rather than a traceback.
argp.add_argument("-f", metavar="fname", type=str, required=True,
                  help="file name")
argp.add_argument("-i", metavar="indent", type=int, default=2,
                  help="Optional indent. Default=2")
# The help text states what the code does: a level below 1 selects the
# full ("details") output, and level n keeps bookmarks nested less than n deep.
argp.add_argument("-l", metavar="level", type=int, default=-1,
                  help="Optional # of outline levels to keep. 1= top level only; "
                       "values below 1 keep all levels. Default: -1")
argp.add_argument("-t", metavar="title", type=str, default="Pdf Bookmarks",
                  help="Optional title for the output file. Default:'Pdf Bookmarks'")
argp.add_argument("-s", metavar="headerLineSpacing", type=str, default="1,1",
                  help="Optional header line spacing in the form of 'm,n'. Default: '1,1'")
args = argp.parse_args()

# Module-level settings read by the functions defined further down.
fname = args.f               # path of the bookmark dump to process
indent = args.i              # spaces that replace each tab
level = args.l               # outline depth; below 1 means "all levels"
title = args.t               # output title; a literal "\n" splits it into lines
headerLineSpacing = args.s   # "m,n": blank lines above,below each top-level entry
# | |
#====================================== | |
# | |
# Read the whole bookmark dump up front. The context manager closes the
# file handle as soon as the lines are in memory.
with open(fname, "r") as _dumpFile:
    lines = _dumpFile.readlines()
def removeStyles( lines=None ):
    ''' Remove styling codes. Return a list.

    For every line, the text after the LAST "/" is cut at its first comma,
    so "Title/12,Black,notBold" becomes "Title/12" (presumably
    "<title>/<page>,<style codes...>" -- confirm against the JPdfBookmarks
    dump format). Leading tabs are kept untouched.

    lines: iterable of raw lines from the dump. When omitted, the
           module-level "lines" list is used.

    Trailing line breaks are always removed from the returned strings.
    Blank lines are skipped, and a line without any "/" is returned as is
    instead of being prefixed with a stray "/".
    '''
    if lines is None:
        # Looked up at call time; the parameter shadows the global name,
        # hence the explicit globals() access.
        lines = globals()["lines"]

    cleaned = []
    for line in lines:
        text = line.rstrip("\r\n")
        if not text.strip():
            continue                      # nothing but whitespace
        head, slash, tail = text.rpartition("/")
        if not slash:
            cleaned.append( text )        # no "/" at all: nothing to strip
        else:
            cleaned.append( head + slash + tail.split(",")[0] )
    return cleaned
def replaceTabs( lines, indentWidth=None, lineSpacing=None ):
    ''' Replace each tab with spaces and pad top-level entries with blank lines.
    Return a list; every returned string ends with a line break.

    lines:       bookmark lines without trailing line breaks.
    indentWidth: spaces per tab. Defaults to the module-level "indent".
    lineSpacing: "m,n" string: m blank lines above and n below every
                 top-level entry. Defaults to the module-level
                 "headerLineSpacing".

    Raises ValueError when lineSpacing is not two comma-separated integers.
    '''
    if indentWidth is None:
        indentWidth = indent
    if lineSpacing is None:
        lineSpacing = headerLineSpacing

    above, below = [ int(part) for part in lineSpacing.split(',') ]
    pad = " " * indentWidth

    result = []
    for line in lines:
        text = line.replace("\t", pad) + "\n"
        # Nesting is decided on the ORIGINAL line. Testing the expanded text
        # would treat every line as top-level when indentWidth is 0.
        if line.startswith(("\t", " ")):
            result.append( text )
        else:
            result.append( "\n"*above + text + "\n"*below )
    return result
def getLines( lines ):
    ''' Return a list containing detailed bookmarks, or an outline when the
    module-level "level" is 1 or more.

    lines: raw lines of the bookmark dump.

    With level < 1 every bookmark is returned. With level = n only the
    bookmarks nested less than n deep are kept (n = 1: top level only).
    Depth is measured in leading tabs BEFORE the tabs are expanded, so the
    outline does not depend on the indent width (an indent of 0 no longer
    filters out everything).
    '''
    cleaned = removeStyles( lines )
    if level >= 1:
        cleaned = [ line for line in cleaned
                    if len(line) - len(line.lstrip("\t")) < level ]
    return replaceTabs( cleaned )
def save( lines = getLines( lines ), title=title ):
    ''' Write the title block followed by the bookmark lines to the output
    file and print a short report.

    lines: processed bookmark lines. The default is computed ONCE, when
           this "def" statement runs, from the module-level "lines".
    title: title text; a literal backslash-n ("\\n") splits it into lines.

    The output name is the input name with "_details" (level < 1) or
    "_outlined_<level>" inserted before the extension, e.g.
    abc.txt -> abc_details.txt. Names without an extension and paths that
    contain dots in a directory part are handled as well.
    '''
    import os   # local import: only this function needs it

    #=============================== Prepare output title
    header = [ '\n'+part for part in title.split('\\n') ]

    # Attach 'details' or 'outlined_n' to the file name
    xname = "outlined_"+str(level) if level >= 1 else "details"
    root, ext = os.path.splitext( fname )
    fn = root + '_' + xname + ext
    header.append( '\n'+fn )

    # Add a separator line "===..." underneath the title. Each header entry
    # starts with a line break, hence the -1.
    header.append( '\n' + "="*( max( len(x) for x in header ) - 1 ) + '\n' )
    #===============================

    with open( fn, 'w' ) as out:
        out.writelines( header + lines )

    print('### Save to file: '+fn
          + '\n### parameters: '
          + str( {"level":level, 'indent':indent, 'headerLineSpacing':headerLineSpacing} ))

save( )
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment