jmbarbier/HTML2PDF.pl

## HTML2PDF.pl
# wrapper for wkhtmltopdf and pdftk to update links with actual page numbers
#  TOC entries get leader dots and page number . . . NN
#  internal links (#id) get [page NN]
# written by Phil M Perry
# (c) copyright 2015, Phil M Perry
# license: GNU Lesser General Public License (LGPL) v3
#
use warnings;
# it's a quick & dirty job that doesn't have many safeguards. use at your own
# risk. anyone is welcome to improve upon it!
#
# input: HTML file to be converted (parameter 1)
#  WARNING: input file will be overwritten several times! save a backup first.
# output: PDF file (parameter 2)
# requires: toPDF.bat, wkhtmltopdf.exe, pdftk.exe
# limitations: each link (<a>...</a>) must be on one line
#              *TOC links (to receive leader dots) include class="toc"
#              any nbsp's used to indent TOC entries must be within link label
#                or else the leader dots and page numbers won't align
#                (and use only 0 or more &nbsp;'s to left pad link label)
#              assumes all links have href= (no name= anchor "links")
#              *link with class="external" do not get [page NN]
#              <A>, </A>, CLASS=, HREF=, etc. not recognized (must be l/c)
#                TBD: could use regex to do case-insensitive matches?
#       * see configuration
#
# configuration ==========================================================
# haven't figured out yet why replacing separate .bat file with inline isn't
#   working... get very cryptic errors. for now, using toPDF.bat
# where to find wkhtmltopdf
##$wkhtmltopdf = "\"C:\\Program Files\\wkhtmltopdf\\bin\\wkhtmltopdf.exe\"";
# parameters for wkhtmltopdf
##$WKparms = "-s Letter " .   # paper size
##           "-B .5in -T .5in -L .5in -R .5in " .  # margins
##           "--header-right \"page [page]\" " .  # page number at upper right
##           "--footer-center \"Confidential, property of someone\" " .
##           "--header-spacing 4 --footer-spacing 4";

# default class="toc" to mark a TOC entry line
$tocClass = "toc";
# default class="external" to mark a link that is NOT to get [page NN]
# TBD: look at href= and if it starts with #, it's internal (no extClass needed)
$extClass = "external";
# when the (TOC) page number updated with the actual number, it is left
# justified (i.e., does not ovewrite any leader dots), so be sure to leave
# room for the largest page number! for narrower pages, reduce size.
$TOCtemplate = ". . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 9999";
# default page template for internal links, added to end of link label
$page99 = "&nbsp;[page&nbsp;99]";
# name of scratch file for HTML rewrite
$tempHTML = "TEMP.HTML";
# name of scratch file for PDF unpack and analyze (a big file!)
$tempPDF = "TEMP.PDF";
# minimum 2 passes needed. it's rare to go more than 3
$maxLoops = 10;
# 0: leave (modified) inFile, 1: erase it when done
$eraseInFile = 1;
# end configuration ======================================================
#
if ($#ARGV != 1) {
  print "require input and output file names\n";
  exit;
}
$inFile = $ARGV[0];
$outFile = $ARGV[1];
$tempFile = $tempPDF;
#
# initialize multipass variables
@objList = ();
#

# initial update of input HTML file is:
#   TOC <a> (with class="toc") add leader dots and 99 page number
#   <a> links without class="external" add &nbsp;[page&nbsp;99]
#   other <a> (external sites) leave alone
# note that this section will need to be updated for any other formats
# of HTML file being processed
print "adding initial page numbers to input HTML file\n";
initialUpdateHTML($inFile);

for ($loop=1; $loop<=$maxLoops; $loop++) {
  print "pass $loop: create PDF and unpack it\n";

  $pageChange = 0;  # no page change seen yet in this loop
  # make PDF
  system("toPDF $inFile $outFile");
  ##system("$wkhtmltopdf $WKparms $inFile $outFile");

  # uncompress the PDF file
  system("pdftk $outFile output $tempFile uncompress");
  # warning! $tempFile can be huge!

  # tempFile is the readable, uncompressed PDF produced by pdftk
  open(IN, $tempFile) || die "can't open temp file $tempFile\n";
  # go through tempFile and get the lines starting with /file
  # save #id objid objid2 at end
  # we may not need to do this more than once, but if an id moves from
  #   one page to another, it is conceivable that the objid may change from
  #   pass to pass

  print " build list of targeted objects\n";
  while ($line=<IN>) {
    if ($line =~ m#^/file:.*\#23(.*) (.*) (.*) R#) {
      if ($loop == 1) {
        # first time, initialize (build objList)
	# page -1 (uninitialized page number)
	push (@objList, [$1, $2, $3, -1]);
      } else {
	# subsequent pass, update object id if necessary
        for ($i=0; $i<=$#objList; $i++) {
	  if ($objList[$i][0] eq $1) {
	    if ($objList[$i][1] != $2 || $objList[$i][2] != $3) {
	      print "#$1 has changed object id from $objList[$i][1] $objList[$i][2] to $2 $3\n";
	      $objList[$i][1] = $2;
	      $objList[$i][2] = $3;
	    }
	  }
	}
      }
    } # end processing a /file: line
  } # end of loop through lines in tempFile
  seek(IN, 0, 0); # we'll read tempFile again

  # now read through to find each object id that's in objList, and record
  # its page number. set flag if changed.
  print " update list of targeted objects with their page numbers\n";
  $page = 0;
  while ($line=<IN>) {
    if ($line =~ m#^/pdftk_PageNum (.*)#) {
      $page = $1;
      next;
    }

    if ($line =~ m#^(.*) (.*) obj #) {
      # found an obj... is it in objList?
      for ($i=0; $i<=$#objList; $i++) {
        if ($objList[$i][1] == $1 && $objList[$i][2] == $2) {
	  if ($objList[$i][3] != $page) {
	    if ($loop > 1) {
	      print "#$objList[$i][0] has changed page number from $objList[$i][3] to $page\n";
	    }
	    $objList[$i][3] = $page;
	    $pageChange = 1;
	    last; # exit for loop, since this line has been processed
	  }
	}
      }

    } # end of processing an object line
  } # end of loop through lines in tempFile

  close(IN);
  unlink($tempFile);
  if (!$pageChange) { last; } # no page numbers changed? done!

  # note: we are not checking if an id disappeared or was new in later passes...
  # assume that set of ids is constant, and that object id and page may change

  # final step in loop, if page numbers have changed, is to update the input
  # HTML file's page numbers.
  print "update page numbers in links and TOC\n";
  updateHTML($inFile, \@objList);

  # too many loops?
  if ($loop == $maxLoops) {
    print "Page numbering did not settle down within $maxLoops passes.\n";
    print "Use output PDF file with caution.\n";
    last;
  }
} # 2+ loops to create PDF/update HTML and repeat
if ($eraseInFile) {
  unlink($inFile); # since inFile has been modified, OK to erase
}
# end of main program

# ==============================================================
# add to TOC entries leader dots and "9" at fixed length entry
# add to other <a> without class="external"  &nbsp;[page&nbsp;99]
# note: <A> and </A> are not matched!
#
# could use a page number other than 99, such as 999 for large documents
# fixed length of TOC entry (number of leader dots) could be varied, depending
#   on desired page width (TOC will be printed with fixed pitch font to ensure
#   that dots align horizontally)
sub initialUpdateHTML {
  my $inFile = shift;

  my $tempFile = $tempHTML;
  my $me = "initialUpdateHTML()";
  my ($line, $pos, $pos2, $tocLine, $label, $len, $i);

  open(IN, $inFile) || die "$me can't open input file $inFile\n";
  open(OUT, ">$tempFile") || die "$me can't open output file $tempFile\n";

  while ($line=<IN>) {
    chomp $line;
    # assume there might be more than one <a> in a line
    $pos = 0;
    while (($pos2 = index($line, "<a ", $pos)) >= 0) {
      # there IS a[nother] <a> in this line
      $pos = $pos2;
      $pos2 = index($line, "</a>", $pos) + 3; # pos..pos2 s/b entire <a>
      if ($pos2 <= $pos) {
	print "line =>$line<=\n seems to be missing its </a>!\n";
	die "This program cannot continue until HTML source is fixed.\n";
      }
      if (index(substr($line, $pos, $pos2-$pos+1), "class=\"$tocClass\"") >= 0) {
        # between pos and pos2, is there class="toc"? if so, is TOC
	$pos = index($line, ">", $pos)+1; # start of link label text
	$pos2 = index($line, "</a>", $pos)-1; # end of link label text
	# at this point pos..pos2 is just a label between <a> and </a>
	$label = substr($line, $pos, $pos2-$pos+1); # raw label
	$tocLine = $TOCtemplate;
	# first, convert any leading &nbsp; in the label to spaces,
	#   remembering the number of &nbsp;'s found
	if ($label =~ m#((&nbsp;)+)#) {
          $len = length($1)/6; # count of non-breaking spaces
	  substr($label, 0, $len*6) = ' ' x $len;
        } else {
	  $len = 0;
	}
	# next, overwrite the template at left with the label+one space (nbsp)
	# pos2-pos+1 is original length of label (with nbsp's)
	# overwrite first part of tocLine by label and non-breaking space
	#  (so label  . . doesn't first get squeezed to label . .)
	$label .= '&nbsp;';
	# finally, restore $len leading blanks to nbsp's
	if ($len > 0) {
	  substr($label, 0, $len) = '&nbsp;' x $len;
	}
	# -5 for trailing nbsp (want to count as one space)
	# -len*5 for any leading nbsp's (want to count as len spaces)
	$tocLine = $label . substr($tocLine, length($label)-5-$len*5);
	$line = substr($line, 0, $pos) . $tocLine . substr($line, $pos2+1);
	$pos2 += length($tocLine) - ($pos2-$pos+1);  # length of added text

      } elsif (index(substr($line, $pos, $pos2-$pos), "class=\"$extClass\"") < 0) {
        # between pos and pos2, is there NOT class="external"? if so, add [page]
	$pos = index($line, ">", $pos)+1; # start of link label text
	$pos2 = index($line, "</a>", $pos)-1; # end of link label text
	$line = substr($line, 0, $pos2+1) . $page99 . substr($line, $pos2+1);
	$pos2 += length($page99);  # length of added text
      }

      $pos = $pos2; # see if there's another <a> to process
    } # done (possibly) updating $line
    print OUT $line."\n"; # output the (sometimes) updated line
  } # reading input HTML file and writing back out modified version

  close(IN);
  close(OUT);

  rename $tempFile, $inFile;
}

# ==============================================================
# update NN page number at end of TOC entries and [page NN] entries
# with current value in objList
#
# unfortunately, there is no reasonable way to determine which page a link
# is on. if there was, we could use "[previous page]" (target's page = link's
# page - 1), "[this page]" (target's page = link's page), and "[next page]"
# (target's page = links's page + 1) instead of always using "[page NN]".
#
sub updateHTML {
  my $inFile = shift;
  my $objListRef = shift;
    my @objList = @$objListRef;

  my $tempFile = $tempHTML;
  my $me = "updateHTML()";
  my ($line, $pos, $pos2, $i, $len);

  # read in HTML, line by line. requires that full <a...>text</a> is all on
  # one line
  open(IN, $inFile) || die "$me can't open input file $inFile\n";
  open(OUT, ">$tempFile") || die "$me can't open output file $tempFile\n";

  while ($line=<IN>) {
    chomp $line;
    # assume there might be more than one <a> in a line
    $pos = 0;
    while (($pos2 = index($line, "<a ", $pos)) >= 0) {
      # there IS a[nother] <a> in this line
      $pos = $pos2;
      $pos2 = index($line, "</a>", $pos) + 3; # pos..pos2 s/b entire <a>
      if ($pos2 <= $pos) {
	print "line =>$line<=\n seems to be missing its </a>!\n";
	die "This program cannot continue until HTML source is fixed.\n";
      }
      if (index(substr($line, $pos, $pos2-$pos+1), "class=\"$tocClass\"") >= 0) {
        # between pos and pos2, is there class="toc"? if so, is TOC
	# find href="#id"
	$pos2 = index($line, "href=", $pos); # assume there IS an href
	substr($line, $pos2) =~ m/href="#([^"]+)"/;
	for ($i=0; $i<=$#objList; $i++) {
	  if ($objList[$i][0] eq $1) { last; } # $i is objList row number
	}

	$pos = index($line, ">", $pos)+1; # start of link label text
	$pos2 = index($line, "</a>", $pos)-1; # end of link label text
	# pos..pos2 is full label between <a> and </a>
	# we're interested in finding the number at the end and overwriting it
	$pos = rindex($line, ' ', $pos2)+1;
	# pos..pos2 should be the old page number \d+
	$line = substr($line, 0, $pos) . $objList[$i][3] . substr($line, $pos2+1);
	$pos2 = index($line, "</a>", $pos);  # may have changed slightly

      } elsif (index(substr($line, $pos, $pos2-$pos), "class=\"$extClass\"") < 0) {
        # between pos and pos2, is there NOT class="external"? if so,
	# update page number
	$pos2 = index($line, "href=", $pos);
	substr($line, $pos2) =~ m/href="#([^"]+)"/;
	for ($i=0; $i<=$#objList; $i++) {
	  if ($objList[$i][0] eq $1) { last; } # $i is index in objList
	}

	$pos = index($line, ">", $pos)+1; # start of link label text
	$pos2 = index($line, "</a>", $pos)-1; # end of link label text ']'
	$pos = rindex($line, "&nbsp;", $pos2)+6; # pos = start of page number
	$line = substr($line, 0, $pos) . $objList[$i][3] . substr($line, $pos2);
	$pos2 = index($line, "</a>", $pos);
      }

      $pos = $pos2; # see if there's another <a> to process
    } # done (possibly) updating $line
    print OUT $line."\n"; # output the (sometimes) updated line
  } # reading input HTML file and writing back out modified version

  close(IN);
  close(OUT);

  rename $tempFile, $inFile;
}

## toPDF.bat
echo off
set input=%1
set output=%2
"c:\Program Files\wkhtmltopdf\bin\wkhtmltopdf.exe" -s Letter -B .5in -T .5in -L .5in -R .5in --header-right "page [page]" --footer-center "Confidential, property of someone" --header-spacing 4 --footer-spacing 4 %input% %output%
	# wrapper for wkhtmltopdf and pdftk to update links with actual page numbers
	# TOC entries get leader dots and page number . . . NN
	# internal links (#id) get [page NN]
	# written by Phil M Perry
	# (c) copyright 2015, Phil M Perry
	# license: GNU Lesser General Public License (LGPL) v3
	#
	use warnings;
	# it's a quick & dirty job that doesn't have many safeguards. use at your own
	# risk. anyone is welcome to improve upon it!
	#
	# input: HTML file to be converted (parameter 1)
	# WARNING: input file will be overwritten several times! save a backup first.
	# output: PDF file (parameter 2)
	# requires: toPDF.bat, wkhtmltopdf.exe, pdftk.exe
	# limitations: each link (<a>...</a>) must be on one line
	# *TOC links (to receive leader dots) include class="toc"
	# any nbsp's used to indent TOC entries must be within link label
	# or else the leader dots and page numbers won't align
	# (and use only 0 or more  's to left pad link label)
	# assumes all links have href= (no name= anchor "links")
	# *link with class="external" do not get [page NN]
	# <A>, </A>, CLASS=, HREF=, etc. not recognized (must be l/c)
	# TBD: could use regex to do case-insensitive matches?
	# * see configuration
	#
	# configuration ==========================================================
	# haven't figured out yet why replacing separate .bat file with inline isn't
	# working... get very cryptic errors. for now, using toPDF.bat
	# where to find wkhtmltopdf
	##$wkhtmltopdf = "\"C:\\Program Files\\wkhtmltopdf\\bin\\wkhtmltopdf.exe\"";
	# parameters for wkhtmltopdf
	##$WKparms = "-s Letter " . # paper size
	## "-B .5in -T .5in -L .5in -R .5in " . # margins
	## "--header-right \"page [page]\"" . # page number at upper right
	## "--footer-center \"Confidential, property of someone\"
	## "--header-spacing 4 --footer-spacing 4";

	# default class="toc" to mark a TOC entry line
	$tocClass = "toc";
	# default class="external" to mark a link that is NOT to get [page NN]
	# TBD: look at href= and if it starts with #, it's internal (no extClass needed)
	$extClass = "external";
	# when the (TOC) page number updated with the actual number, it is left
	# justified (i.e., does not ovewrite any leader dots), so be sure to leave
	# room for the largest page number! for narrower pages, reduce size.
	$TOCtemplate = ". . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 9999";
	# default page template for internal links, added to end of link label
	$page99 = " [page 99]";
	# name of scratch file for HTML rewrite
	$tempHTML = "TEMP.HTML";
	# name of scratch file for PDF unpack and analyze (a big file!)
	$tempPDF = "TEMP.PDF";
	# minimum 2 passes needed. it's rare to go more than 3
	$maxLoops = 10;
	# 0: leave (modified) inFile, 1: erase it when done
	$eraseInFile = 1;
	# end configuration ======================================================
	#
	if ($#ARGV != 1) {
	print "require input and output file names\n";
	exit;
	}
	$inFile = $ARGV[0];
	$outFile = $ARGV[1];
	$tempFile = $tempPDF;
	#
	# initialize multipass variables
	@objList = ();
	#

	# initial update of input HTML file is:
	# TOC <a> (with class="toc") add leader dots and 99 page number
	# <a> links without class="external" add  [page 99]
	# other <a> (external sites) leave alone
	# note that this section will need to be updated for any other formats
	# of HTML file being processed
	print "adding initial page numbers to input HTML file\n";
	initialUpdateHTML($inFile);

	for ($loop=1; $loop<=$maxLoops; $loop++) {
	print "pass $loop: create PDF and unpack it\n";

	$pageChange = 0; # no page change seen yet in this loop
	# make PDF
	system("toPDF $inFile $outFile");
	##system("$wkhtmltopdf $WKparms $inFile $outFile");

	# uncompress the PDF file
	system("pdftk $outFile output $tempFile uncompress");
	# warning! $tempFile can be huge!

	# tempFile is the readable, uncompressed PDF produced by pdftk
	open(IN, $tempFile) \|\| die "can't open temp file $tempFile\n";
	# go through tempFile and get the lines starting with /file
	# save #id objid objid2 at end
	# we may not need to do this more than once, but if an id moves from
	# one page to another, it is conceivable that the objid may change from
	# pass to pass

	print " build list of targeted objects\n";
	while ($line=<IN>) {
	if ($line =~ m#^/file:.\#23(.) (.) (.) R#) {
	if ($loop == 1) {
	# first time, initialize (build objList)
	# page -1 (uninitialized page number)
	push (@objList, [$1, $2, $3, -1]);
	} else {
	# subsequent pass, update object id if necessary
	for ($i=0; $i<=$#objList; $i++) {
	if ($objList[$i][0] eq $1) {
	if ($objList[$i][1] != $2 \|\| $objList[$i][2] != $3) {
	print "#$1 has changed object id from $objList[$i][1] $objList[$i][2] to $2 $3\n";
	$objList[$i][1] = $2;
	$objList[$i][2] = $3;
	}
	}
	}
	}
	} # end processing a /file: line
	} # end of loop through lines in tempFile
	seek(IN, 0, 0); # we'll read tempFile again

	# now read through to find each object id that's in objList, and record
	# its page number. set flag if changed.
	print " update list of targeted objects with their page numbers\n";
	$page = 0;
	while ($line=<IN>) {
	if ($line =~ m#^/pdftk_PageNum (.*)#) {
	$page = $1;
	next;
	}

	if ($line =~ m#^(.) (.) obj #) {
	# found an obj... is it in objList?
	for ($i=0; $i<=$#objList; $i++) {
	if ($objList[$i][1] == $1 && $objList[$i][2] == $2) {
	if ($objList[$i][3] != $page) {
	if ($loop > 1) {
	print "#$objList[$i][0] has changed page number from $objList[$i][3] to $page\n";
	}
	$objList[$i][3] = $page;
	$pageChange = 1;
	last; # exit for loop, since this line has been processed
	}
	}
	}

	} # end of processing an object line
	} # end of loop through lines in tempFile

	close(IN);
	unlink($tempFile);
	if (!$pageChange) { last; } # no page numbers changed? done!

	# note: we are not checking if an id disappeared or was new in later passes...
	# assume that set of ids is constant, and that object id and page may change

	# final step in loop, if page numbers have changed, is to update the input
	# HTML file's page numbers.
	print "update page numbers in links and TOC\n";
	updateHTML($inFile, \@objList);

	# too many loops?
	if ($loop == $maxLoops) {
	print "Page numbering did not settle down within $maxLoops passes.\n";
	print "Use output PDF file with caution.\n";
	last;
	}
	} # 2+ loops to create PDF/update HTML and repeat
	if ($eraseInFile) {
	unlink($inFile); # since inFile has been modified, OK to erase
	}
	# end of main program

	# ==============================================================
	# add to TOC entries leader dots and "9" at fixed length entry
	# add to other <a> without class="external"  [page 99]
	# note: <A> and </A> are not matched!
	#
	# could use a page number other than 99, such as 999 for large documents
	# fixed length of TOC entry (number of leader dots) could be varied, depending
	# on desired page width (TOC will be printed with fixed pitch font to ensure
	# that dots align horizontally)
	sub initialUpdateHTML {
	my $inFile = shift;

	my $tempFile = $tempHTML;
	my $me = "initialUpdateHTML()";
	my ($line, $pos, $pos2, $tocLine, $label, $len, $i);

	open(IN, $inFile) \|\| die "$me can't open input file $inFile\n";
	open(OUT, ">$tempFile") \|\| die "$me can't open output file $tempFile\n";

	while ($line=<IN>) {
	chomp $line;
	# assume there might be more than one <a> in a line
	$pos = 0;
	while (($pos2 = index($line, "<a ", $pos)) >= 0) {
	# there IS a[nother] <a> in this line
	$pos = $pos2;
	$pos2 = index($line, "</a>", $pos) + 3; # pos..pos2 s/b entire <a>
	if ($pos2 <= $pos) {
	print "line =>$line<=\n seems to be missing its </a>!\n";
	die "This program cannot continue until HTML source is fixed.\n";
	}
	if (index(substr($line, $pos, $pos2-$pos+1), "class=\"$tocClass\"") >= 0) {
	# between pos and pos2, is there class="toc"? if so, is TOC
	$pos = index($line, ">", $pos)+1; # start of link label text
	$pos2 = index($line, "</a>", $pos)-1; # end of link label text
	# at this point pos..pos2 is just a label between <a> and </a>
	$label = substr($line, $pos, $pos2-$pos+1); # raw label
	$tocLine = $TOCtemplate;
	# first, convert any leading   in the label to spaces,
	# remembering the number of  's found
	if ($label =~ m#(( )+)#) {
	$len = length($1)/6; # count of non-breaking spaces
	substr($label, 0, $len*6) = ' ' x $len;
	} else {
	$len = 0;
	}
	# next, overwrite the template at left with the label+one space (nbsp)
	# pos2-pos+1 is original length of label (with nbsp's)
	# overwrite first part of tocLine by label and non-breaking space
	# (so label . . doesn't first get squeezed to label . .)
	$label .= ' ';
	# finally, restore $len leading blanks to nbsp's
	if ($len > 0) {
	substr($label, 0, $len) = ' ' x $len;
	}
	# -5 for trailing nbsp (want to count as one space)
	# -len*5 for any leading nbsp's (want to count as len spaces)
	$tocLine = $label . substr($tocLine, length($label)-5-$len*5);
	$line = substr($line, 0, $pos) . $tocLine . substr($line, $pos2+1);
	$pos2 += length($tocLine) - ($pos2-$pos+1); # length of added text

	} elsif (index(substr($line, $pos, $pos2-$pos), "class=\"$extClass\"") < 0) {
	# between pos and pos2, is there NOT class="external"? if so, add [page]
	$pos = index($line, ">", $pos)+1; # start of link label text
	$pos2 = index($line, "</a>", $pos)-1; # end of link label text
	$line = substr($line, 0, $pos2+1) . $page99 . substr($line, $pos2+1);
	$pos2 += length($page99); # length of added text
	}

	$pos = $pos2; # see if there's another <a> to process
	} # done (possibly) updating $line
	print OUT $line."\n"; # output the (sometimes) updated line
	} # reading input HTML file and writing back out modified version

	close(IN);
	close(OUT);

	rename $tempFile, $inFile;
	}

	# ==============================================================
	# update NN page number at end of TOC entries and [page NN] entries
	# with current value in objList
	#
	# unfortunately, there is no reasonable way to determine which page a link
	# is on. if there was, we could use "[previous page]" (target's page = link's
	# page - 1), "[this page]" (target's page = link's page), and "[next page]"
	# (target's page = links's page + 1) instead of always using "[page NN]".
	#
	sub updateHTML {
	my $inFile = shift;
	my $objListRef = shift;
	my @objList = @$objListRef;

	my $tempFile = $tempHTML;
	my $me = "updateHTML()";
	my ($line, $pos, $pos2, $i, $len);

	# read in HTML, line by line. requires that full <a...>text</a> is all on
	# one line
	open(IN, $inFile) \|\| die "$me can't open input file $inFile\n";
	open(OUT, ">$tempFile") \|\| die "$me can't open output file $tempFile\n";

	while ($line=<IN>) {
	chomp $line;
	# assume there might be more than one <a> in a line
	$pos = 0;
	while (($pos2 = index($line, "<a ", $pos)) >= 0) {
	# there IS a[nother] <a> in this line
	$pos = $pos2;
	$pos2 = index($line, "</a>", $pos) + 3; # pos..pos2 s/b entire <a>
	if ($pos2 <= $pos) {
	print "line =>$line<=\n seems to be missing its </a>!\n";
	die "This program cannot continue until HTML source is fixed.\n";
	}
	if (index(substr($line, $pos, $pos2-$pos+1), "class=\"$tocClass\"") >= 0) {
	# between pos and pos2, is there class="toc"? if so, is TOC
	# find href="#id"
	$pos2 = index($line, "href=", $pos); # assume there IS an href
	substr($line, $pos2) =~ m/href="#([^"]+)"/;
	for ($i=0; $i<=$#objList; $i++) {
	if ($objList[$i][0] eq $1) { last; } # $i is objList row number
	}

	$pos = index($line, ">", $pos)+1; # start of link label text
	$pos2 = index($line, "</a>", $pos)-1; # end of link label text
	# pos..pos2 is full label between <a> and </a>
	# we're interested in finding the number at the end and overwriting it
	$pos = rindex($line, ' ', $pos2)+1;
	# pos..pos2 should be the old page number \d+
	$line = substr($line, 0, $pos) . $objList[$i][3] . substr($line, $pos2+1);
	$pos2 = index($line, "</a>", $pos); # may have changed slightly

	} elsif (index(substr($line, $pos, $pos2-$pos), "class=\"$extClass\"") < 0) {
	# between pos and pos2, is there NOT class="external"? if so,
	# update page number
	$pos2 = index($line, "href=", $pos);
	substr($line, $pos2) =~ m/href="#([^"]+)"/;
	for ($i=0; $i<=$#objList; $i++) {
	if ($objList[$i][0] eq $1) { last; } # $i is index in objList
	}

	$pos = index($line, ">", $pos)+1; # start of link label text
	$pos2 = index($line, "</a>", $pos)-1; # end of link label text ']'
	$pos = rindex($line, " ", $pos2)+6; # pos = start of page number
	$line = substr($line, 0, $pos) . $objList[$i][3] . substr($line, $pos2);
	$pos2 = index($line, "</a>", $pos);
	}

	$pos = $pos2; # see if there's another <a> to process
	} # done (possibly) updating $line
	print OUT $line."\n"; # output the (sometimes) updated line
	} # reading input HTML file and writing back out modified version

	close(IN);
	close(OUT);

	rename $tempFile, $inFile;
	}
	echo off
	set input=%1
	set output=%2
	"c:\Program Files\wkhtmltopdf\bin\wkhtmltopdf.exe" -s Letter -B .5in -T .5in -L .5in -R .5in --header-right "page [page]" --footer-center "Confidential, property of someone" --header-spacing 4 --footer-spacing 4 %input% %output%