## apg/man2html.awk

## man2html.awk
### ====================================================================
###  @Awk-file{
###     author          = "Nelson H. F. Beebe",
###     version         = "1.06",
###     date            = "24 October 1997",
###     time            = "21:34:34 MDT",
###     filename        = "man2html.awk",
###     address         = "Center for Scientific Computing
###                        University of Utah
###                        Department of Mathematics, 105 JWB
###                        155 S 1400 E RM 233
###                        Salt Lake City, UT 84112-0090
###                        USA",
###     telephone       = "+1 801 581 5254",
###     FAX             = "+1 801 581 4148",
###     URL             = "http://www.math.utah.edu/~beebe",
###     checksum        = "01400 968 2975 23193",
###     email           = "beebe@math.utah.edu (Internet)",
###     codetable       = "ISO/ASCII",
###     keywords        = "nroff, troff, UNIX manual page",
###     supported       = "yes",
###     docstring       = "This program converts UNIX manual pages
###                        in nroff/troff markup to strictly-conformant
###                        HTML 2.0, 3.0, or 3.2.  [Actually, only two
###                        HTML 3.x entities (`&nbsp;' and `&shy;')
###                        are used, and those rarely; otherwise, the
###                        syntax conforms strictly to HTML 2.0.]
###
###                        Usage:
###                        	nawk -f man2html.awk [HTML=2|3|3.2] \
###                                manpage-file >html-file
###
###                        The single option, HTML=2, HTML=3, or
###                        HTML=3.2, selects the HTML grammar level.
###                        The default is HTML=2.
###
###                        This program is normally run via a shell
###                        wrapper that offers an option for setting the
###                        output file name.  It has been used to
###                        successfully convert entire man-page
###                        collections on several UNIX systems to HTML
###                        form for convenient World-Wide Web browser
###                        access.
###
###                        Of those nroff/troff commands defined in the
###                        -man format used for UNIX manual pages, only
###                        the most commonly-used ones are supported;
###                        unrecognized ones will be warned about, and
###                        preserved as HTML comments in the output.
###
###                        UNIX man pages tend to be written in a
###                        highly-stylized fashion that we apply
###                        heuristics to in order to recover high-level
###                        HTML structure from low-level nroff/troff
###                        markup.  Deviations from conventional
###                        man-page writing practice will likely result
###                        in less-than-perfect translation to HTML.
###
###                        Although there are several other `man2html'
###                        translators available on the Internet, this
###                        one is entirely of my own authorship, with no
###                        code borrowing from anywhere else.
###
###                        The checksum field above contains a CRC-16
###                        checksum as the first value, followed by the
###                        equivalent of the standard UNIX wc (word
###                        count) utility output of lines, words, and
###                        characters.  This is produced by Robert
###                        Solovay's checksum utility.",
###  }
### ====================================================================

# ----------------------------------------------------------------------
# Main dispatch table.
#
# awk tries these rules top to bottom for every input line, and almost
# every action ends with "next", so the FIRST matching rule handles the
# line.  More specific patterns must therefore stay ahead of the more
# general ones (e.g. ".if t \{" before ".if t", and every known request
# before the catch-all /^[.][A-Za-z]/).  Lines that match no rule are
# ordinary text and are converted by the final pattern-less rule.
# ----------------------------------------------------------------------

BEGIN 		{ initialize() }

# ---- conditionals, string definitions and comments ----

/^[.]ie +t +[.]ds/ { getline }	# fall through: next line should be .el

/^[.]el +[.]ds/	{ define($3); next }

/^[.']\\"/	{ cmd_comment($0); next } # save comments

/^[.]if +n *\\\{/ { cmd_comment($0); next }

/^[.]if +t *\\\{/ { cmd_comment_block($0); next } # convert troff directives to comments

/^ *\\\}/	{ cmd_comment($0); next }

/^[.]if +t/	{ cmd_comment($0); next } # convert troff directives to comments

/^[.]if +n +[.]ds/ { define($4); next }

/^[.]if +n +[.]ti/ { cmd_comment($0); next } # convert nroff spacing directives to comments

/^[.]ie +n +[.]ds/ { define($4); next }

# ---- document structure ----

/^[.]SH/	{ cmd_SH(); next }

/^[.]SS/	{ cmd_SS(); next }

/^[.]TH/	{ cmd_TH(); next }

# ---- fonts, spacing, filling and lists ----

/^[.]B /	{ cmd_B(); next }

/^[.]I /	{ cmd_I(); next }

/^[.]IX /	{ cmd_IX(); next }

/^[.]R /	{ cmd_R(); next }

/^[.]ad/	{ cmd_ad(); next }

/^[.][BIR]$/	{ cmd_BIR(); next }

/^[.]BI /	{ cmd_XY("B","I"); next }

/^[.]br/	{ cmd_br(); next }

/^[.]BR /	{ cmd_XY("B","R"); next }

/^[.]ce[ 0-9]*$/ { cmd_ce(); next }

/^[.]hw/	{ cmd_hw(); next }

/^[.]IB /	{ cmd_XY("I","B"); next }

/^[.]IR /	{ cmd_XY("I","R"); next }

/^[.]ne/	{ cmd_ne(); next }

/^[.]RB /	{ cmd_XY("R","B"); next }

/^[.]RI /	{ cmd_XY("R","I"); next }

/^[.]nf/	{ cmd_nf(); next }

/^[.]fi/	{ cmd_fi(); next }

/^[.]IP/	{ cmd_IP(); next }

/^[.]LP/	{ cmd_LP(); next }

/^[.]na/	{ cmd_na(); next }

/^[.]PP/	{ cmd_PP(); next }

/^[.]RE/	{ cmd_RE(); next }

/^[.]RS/	{ cmd_RS(); next }

/^[.]sp/	{ cmd_sp(); next }

/^[.]TP/	{ cmd_TP(); next }

/^[.]TS/	{ cmd_TS(); next }

# ---- anything else that looks like a request is warned about ----

/^[.][A-Za-z]/	{ cmd_unknown(); next }

# ---- ordinary text ----

		{ print_line(strtohtml($0)) }

END		{ terminate(); }


# The anchor() function is adapted from my bibtex-to-html.awk file

function anchor(s,type,pattern,offset,prefix,save_label, name,rstart,rlength,save)
{
    # Return s with every substring matching pattern wrapped in an
    # anchor <A type="prefix+match">match</A>.
    #
    #   s          text to scan
    #   type       anchor attribute name ("HREF" or "NAME"); "NAME"
    #              anchors additionally get <STRONG>...</STRONG>
    #   pattern    dynamic regular expression locating the text
    #   offset     number of leading characters of each match that are
    #              context only and stay outside the anchored region
    #   prefix     text put in front of the match inside the attribute
    #              value (e.g. "mailto:")
    #   save_label passed through unchanged to the recursive call; not
    #              otherwise used here
    #
    # The remaining names are locals; save is retained only so that the
    # parameter list is unchanged.

    # Require the match to be longer than the discarded context:
    # otherwise the anchored text would be empty and the recursion
    # below might never make progress.
    if (match(s,pattern) && (RLENGTH > offset))
    {
	rstart = RSTART + offset	# private copies: the recursive call
	rlength = RLENGTH - offset	# overwrites RSTART and RLENGTH

	name = substr(s,rstart,rlength)
	sub(/ +at +/,"@",name)	# reduce "user at host" to "user@host"

	# Anchor this match, then process the remainder of the string.
	# (The original passed the uninitialized local `save' here
	# instead of the caller's save_label.)
	s = substr(s,1,rstart-1) \
	    "<A " type "=\"" prefix name "\">" \
	    ((type == "NAME") ? "<STRONG>" : "") \
	    substr(s,rstart,rlength) \
	    ((type == "NAME") ? "</STRONG>" : "") \
	    "</A>" \
	    anchor(substr(s,rstart+rlength),type,pattern,offset,prefix,save_label)
    }
    return (s)
}


function begin_toc()
{
    # Open the table of contents: a level-1 heading followed by the
    # start of the list that receives one entry per section heading.
    In_TOC_Item = 0		# no <LI> has been opened yet

    print_toc("<H1>")
    print_toc("Table of contents")
    print_toc("</H1>")
    print_toc("<UL>")
}


# Handle the .ad request by preserving the current input line as an
# HTML comment.
function cmd_ad()
{				# .ad: turn on adjust (flush-left-and-right justification)
    cmd_comment($0)		# no HTML equivalent
}


function cmd_B( s)
{
    # .B arg ...: output all arguments of the current line, joined as
    # written, in bold (<STRONG>).  s is a local.
    #
    # The original emitted only $2 for the unquoted form, so that
    # ".B foo bar" lost "bar"; the whole argument text is now used.

    end_font()
    s = $0
    sub(/^[.]B[ \t]*/,"",s)	# drop the request name
    gsub(/"/,"",s)		# drop the argument-delimiting quotes
    sub(/[ \t]+$/,"",s)		# drop trailing white space
    print_line("<STRONG>" strtohtml(s) "</STRONG>")
}


function cmd_BIR( letter)
{
    # A bare .B, .I or .R line: rewrite it as the in-line font escape
    # \fB, \fI or \fR and let strtohtml() do the conversion.
    # letter is a local.

    end_font()
    letter = substr($0,2,1)	# the font letter following the dot
    print_line(strtohtml("\\f" letter))
}


# Handle the .br request: it is treated exactly like .PP (see cmd_PP).
function cmd_br()
{
    cmd_PP()
}


function cmd_ce( k,n)
{
    # .ce nnn: turn on centering for next nnn lines (nnn = 0 turns it off)
    #
    # The request itself is kept as an HTML comment; the following nnn
    # input lines are consumed here and each is output with a trailing
    # <BR>.  k and n are locals.

    # A bare ".ce" centers one line in troff; the original treated it
    # as a count of zero and centered nothing.
    n = (NF >= 2) ? $2 : 1
    cmd_comment($0)
    if (n > 0)
    {
	# Only the HTML 3.2 grammar has <CENTER> ... </CENTER> (a
	# shorthand for <DIV ALIGN=CENTER> ... </DIV>), so the wrapper
	# is emitted for that level alone; at the other levels the
	# lines are merely kept one per output line.
	if (HTML == "3.2")
	    print_line("<CENTER>")
	for (k = 1; k <= n; ++k)
	{
	    # Stop at end of input rather than re-emitting the last
	    # line for each remaining count.
	    if ((getline) <= 0)
		break
	    print_line(strtohtml($0) "<BR>")
	}
	if (HTML == "3.2")
	    print_line("</CENTER>")
    }
}


function cmd_comment(s)
{
    # Output s as a single HTML comment.  In_Comment is raised while
    # strtohtml() runs so that it hides "--" pairs and adds no links,
    # and is always left at 0 on return.

    sub(/^[.']\\"/,"",s)	# remove troff comment prefix: it confuses html-pretty
    In_Comment = 1
    s = "<!-- " strtohtml(s) " -->"
    print_line(s)
    In_Comment = 0
}


function cmd_comment_block(s)
{
    # Turn a whole ".if t \{ ... \}" block into HTML comments: first
    # the opening line s, then every following input line up to and
    # including the one that starts with the closing "\}".

    cmd_comment(s)

    In_Comment = 1
    for (;;)
    {
	if ((getline s) <= 0)
	    break		# end of input (or read error)
	cmd_comment(s)
	if (s ~ /^ *\\\}/)
	    break		# found end of block
    }
    In_Comment = 0
}


function cmd_I( s)
{
    # .I arg ...: output all arguments of the current line, joined as
    # written, in italics (<EM>).  s is a local.
    #
    # The original emitted only $2 for the unquoted form, so that
    # ".I foo bar" lost "bar"; the whole argument text is now used.

    end_font()
    s = $0
    sub(/^[.]I[ \t]*/,"",s)	# drop the request name
    gsub(/"/,"",s)		# drop the argument-delimiting quotes
    sub(/[ \t]+$/,"",s)		# drop trailing white space
    print_line("<EM>" strtohtml(s) "</EM>")
}


function cmd_IX()
{
    # .IX index entry lines are simply discarded, together with any
    # continuation lines: as long as the current line ends with a
    # backslash, the next line is read in its place.
    while (match($0,/\\$/))
    {
	if ((getline) <= 0)
	    break		# no more input
    }
}


function cmd_fi()
{
    # .fi: close the <PRE> environment opened by .nf.  When none is
    # open, the request is only recorded as an HTML comment.

    end_font()
    if (!In_PRE)
    {
	cmd_comment($0)
	return
    }
    print_line("</PRE>")
    In_PRE = 0
}


# Handle the .hw request by preserving the current input line as an
# HTML comment.
function cmd_hw()
{				# .hw word-hyph-en-a-tion ex-cep-tions
    cmd_comment($0)
}


function cmd_IP( label)
{
    # .IP [label [indent]]: start an indented paragraph.  label is a
    # local.
    #
    # The original discarded the label argument, so bullets and option
    # names given this way vanished from the output; it is now emitted
    # at the start of the paragraph.  The indent argument has no HTML
    # equivalent and is ignored.

    end_font()
    PP++

    if (In_PRE)			# <P> tags are illegal in <PRE>...</PRE> environments
	print_line("")
    else
	print_line("<P>")

    protect_quoted_args()	# keep a quoted multi-word label in one field
    if (NF >= 2)
    {
	label = unprotect_quoted_arg($2)
	if (label != "")
	    print_line(strtohtml(label))
    }
}


function cmd_LP()
{
    # .LP is a synonym of .PP in the -man macros, so it is handled by
    # the same code.  The original duplicated part of cmd_PP() but
    # emitted <P> even inside <PRE> and left an open .TP list
    # unterminated.
    cmd_PP()
}


# Handle the .na request by preserving the current input line as an
# HTML comment.
function cmd_na()
{	# .na: no adjust: turn off flush-left-and-right justification, producing ragged-right
    cmd_comment($0)		# no HTML equivalent
}


# Handle the .ne request by preserving the current input line as an
# HTML comment.
function cmd_ne()
{				# .ne dimen: need dimen vertical space before end of page
				# otherwise, force a page break (e.g. to prevent page
				# breaks after headings)
    cmd_comment($0)
}


function cmd_nf()
{
    # .nf: begin unfilled text, rendered as a <PRE> environment.  When
    # one is already open, the request is only recorded as an HTML
    # comment.

    end_font()
    if (!In_PRE)
    {
	print_line("<PRE>")
	In_PRE = 1
	return
    }
    cmd_comment($0)
}


function cmd_PP()
{
    # .PP: start a new paragraph and terminate the current .TP list
    # level, if any.

    end_font()
    PP++

    # <P> tags are illegal in <PRE>...</PRE> environments, so an empty
    # line is output there instead.
    print_line(In_PRE ? "" : "<P>")

    end_TP()
}


function cmd_R( s)
{
    # .R arg ...: output all arguments of the current line, joined as
    # written, with no font markup.  s is a local.
    #
    # The original emitted only $2 for the unquoted form, so that
    # ".R foo bar" lost "bar"; the whole argument text is now used.

    end_font()
    s = $0
    sub(/^[.]R[ \t]*/,"",s)	# drop the request name
    gsub(/"/,"",s)		# drop the argument-delimiting quotes
    sub(/[ \t]+$/,"",s)		# drop trailing white space
    print_line(strtohtml(s))
}


function cmd_RE()
{
    # .RE: end the relative indent begun by the matching .RS.  Lists
    # opened inside the indent are closed first, then the <BLOCKQUOTE>
    # emitted by cmd_RS() is closed.

    end_font()
    if (In_PRE)			# should not happen, but some man pages
	cmd_fi()		# are irregular

    if (RSE_Level <= 0)
    {
	# Unmatched .RE: there is no open <BLOCKQUOTE>, so emitting a
	# close tag (as the original did) would unbalance the HTML.
	cmd_comment($0)
	return
    }

    while (List_Level > RSE_List_Level[RSE_Level])
	end_TP()
    RSE_Level--
    print_line("</BLOCKQUOTE>")
}


function cmd_RS()
{
    # .RS: begin a relative indent, rendered as <BLOCKQUOTE>.  The list
    # level in force is remembered so that cmd_RE() can unwind to it.

    end_font()
    RSE_Level++
    RSE_List_Level[RSE_Level] = List_Level
    List_Level++		# new .TP level too
    print_line("<BLOCKQUOTE>")
}


# Handle .SH by delegating to cmd_SH_SS() with the H1 tag.
# The local s is declared but not used.
function cmd_SH( s)
{				# section heading
    cmd_SH_SS("H1")
}


# Handle .SS by delegating to cmd_SH_SS() with the H2 tag.
# The local s is declared but not used.
function cmd_SS( s)
{				# subsection heading
    cmd_SH_SS("H2")
}


# Emit a section (tag "H1") or subsection (tag "H2") heading for the
# current input line, with a named anchor, and add a matching entry to
# the table of contents.  s is a local holding the heading text.
function cmd_SH_SS(tag, s)
{				# [sub]section heading
    if (!TH_seen)		# should not happen, but some man pages are
	cmd_TH(substr($0,5))	# irregular
    end_font()
    # Unwind any .RS indents and .TP lists still open from the
    # previous section.
    while (RSE_Level > 0)
	cmd_RE()
    while (List_Level > 0)
	end_TP()
    if (tag == "H1")
    {
	H1++
	if (H1 == 1)
	    begin_toc()
	if (H2 > 0)
	{
	    # The previous section had subsections: close the last
	    # subsection item and the nested TOC list.
	    print_toc("</LI>")
	    print_toc("</UL>")
	}
	H2 = 0
	if (H1 > 1)
	    print_line("<HR>")	# a separating horizontal rule is a nice touch
    }
    else if (tag == "H2")
    {
	H2++
    }
    # Heading text: everything from column 5 of the line, with one
    # pair of surrounding quotes removed.
    s = substr($0,5)
    sub(/^ *\"/,"",s)
    sub(/\" *$/,"",s)
    s = strtohtml(s)

    # Anchor name suffix: ".<section>" or ".<section>.<subsection>"
    SH_SS_count = "." H1
    if (H2 > 0)
	SH_SS_count = SH_SS_count "." H2

    print_line("<" tag ">")
    print_line("<A NAME=\"HDR" SH_SS_count "\">")
    print_line(s)
    print_line("</A>")
    print_line("</" tag ">")

    # TOC entry.  The first subsection (H2 == 1) opens a nested list
    # inside the still-open item of its section; otherwise the
    # previous item is closed first.
    if (In_TOC_Item && (H2 != 1))
	print_toc("</LI>")
    if (H2 == 1)
	print_toc("<UL>")
    In_TOC_Item = 1
    print_toc("<LI>")
    print_toc("<A HREF=\"#HDR" SH_SS_count "\">")
    print_toc(s)
    print_toc("</A>")
}


# Handle the .sp request by preserving the current input line as an
# HTML comment.
function cmd_sp()
{				# .sp nnn: vertical space
    cmd_comment($0)		# no sensible HTML equivalent
}


function cmd_TH( line)
{
    # .TH: emit the HTML document head, using the text of the current
    # input line (from column 4 on) as the <TITLE>, and open <BODY>.
    # line is a local; a value passed by a caller is overwritten.

    end_font()
    print_line("<HTML>")
    print_line("<HEAD>")
    print_line("<TITLE>")

    # Join backslash-continued lines into one title line.
    line = $0
    while (line ~ /\\$/)
    {
	line = substr(line,1,length(line)-1)	# drop the trailing backslash
	# The original ignored the getline status: at end of input $0
	# kept its trailing backslash and the loop never terminated.
	if ((getline) <= 0)
	    break
	line = line $0
    }
    print_line(strtohtml(substr(line,4)))
    print_line("</TITLE>")
    print_line("<LINK REV=\"made\" HREF=\"mailto:" LOGNAME "@" HOSTNAME "\">")
    print_line("</HEAD>")
    print_line("")
    print_line("<BODY>")
    print_line("")
    TH_seen = 1
}


# Handle .TP (tagged paragraph).  The line following .TP is read here
# as the item label.  A label of exactly "\(bu" on the first item makes
# the list a <UL> with <LI> items; any other label makes it a <DL>
# whose label goes in <DT> and whose body follows in <DD>.
function cmd_TP()
{
    end_font()
    getline		# this is the item label, usually "\(bu" or ".B ..."
    if (Item_Count[List_Level] == 0) # then first item of new list
    {
	# Open a new list one level deeper; its type is fixed by the
	# first label.
	List_Level++
	Item_Count[List_Level] = 0
        if ($0 == "\\(bu")
	{
	    List_Name[List_Level] = "UL"
	    List_Item[List_Level] = "LI"
	}
	else
	{
	    List_Name[List_Level] = "DL"
	    List_Item[List_Level] = "DT"
	}
	# Always true here: the count was set to 0 just above.
	if (Item_Count[List_Level] == 0)
	    print_line("<" List_Name[List_Level] ">")
    }
    Item_Count[List_Level]++
    if (List_Name[List_Level] == "DL")
    {
	if (Item_Count[List_Level] > 1)
	    print_line("</DD>")	# close the body of the previous item
	print_line("<DT>")
	# The label may itself be a font request: dispatch it by hand,
	# because the main rules never see this line.
	if ($0 ~ /^[.]B /)
	    cmd_B()
	else if ($0 ~ /^[.]I /)
	    cmd_I()
	else if ($0 ~ /^[.]R /)
	    cmd_R()
	else if ($0 ~ /^[.]BR/)
	    cmd_XY("B","R")
	else if ($0 ~ /^[.]BI/)
	    cmd_XY("B","I")
	else if ($0 ~ /^[.]IB/)
	    cmd_XY("I","B")
	else if ($0 ~ /^[.]IR/)
	    cmd_XY("I","R")
	else if ($0 ~ /^[.]RB/)
	    cmd_XY("R","B")
	else if ($0 ~ /^[.]RI/)
	    cmd_XY("R","I")
	else
	    print_line(strtohtml($0))
	end_font()
	if (In_PRE)		# should not happen, but some man pages
	    cmd_fi()		# are irregular
	print_line("</DT>")
	print_line("<DD>")
    }
    else			# must be <UL> <LI> ... </LI> </UL> type list
    {
	# The bullet label itself is not output: <LI> supplies it.
	if (Item_Count[List_Level] > 1)
	    print_line("</LI>")
	print_line("<LI>")
    }
}


function cmd_TS( tbl_nroff_cmd)
{
    # .TS ... .TE: tables are not translated to HTML.  The table source
    # is saved in TBLFILE, formatted by an external pipeline, and the
    # formatted text is included inside <PRE> ... </PRE>.
    # tbl_nroff_cmd is a local.

    # Save the table, from the current .TS line through the .TE line
    # (or the end of input), in the temporary file.
    do
    {
	print $0 >TBLFILE
	if ($0 ~ /^[.]TE/)	# then end of table found
	    break
    } while ((getline) > 0)
    close (TBLFILE)

    # tbl, nroff, and col turn the table into plain formatted text.
    tbl_nroff_cmd = "tbl " TBLFILE " | nroff -man | col -b"

    print_line("<PRE>")
    while ((tbl_nroff_cmd | getline) > 0)
	print_line(strtohtml($0))
    print_line("</PRE>")
    close (tbl_nroff_cmd)
    delete_file(TBLFILE)
}


function cmd_unknown( msg)
{
    # Any request without a handler: warn on the error stream and keep
    # the line in the output as an HTML comment.  msg is a local.

    end_font()
    msg = "Unrecognized nroff/troff command in [" $0 "] changed to comment"
    warning(msg)
    cmd_comment($0)
}


function cmd_XY(x,y, font,k)
{
    # Alternating-font requests (.BI, .BR, .IB, .IR, .RB, .RI): the
    # arguments of the current line are output without separating
    # space, in font x for fields 2, 4, ... and font y for fields
    # 3, 5, ...  font and k are locals.

    end_font()
    protect_quoted_args()	# keep each quoted argument in one field
    k = 2
    while (k <= NF)
    {
	if (k % 2)
	    font = Font_Map[y]
	else
	    font = Font_Map[x]
	printf("%s%s%s", html_font_begintag(font), strtohtml(unprotect_quoted_arg($k)), \
	       html_font_endtag(font)) > TMPFILE
	k++
    }
    print_line("")		# finish the output line
}


function define(name, regexp,value,esc,c,k)
{
    # Record the string defined by the ".ds name value" request on the
    # current input line, so that strtohtml() can later replace uses
    # of \*(name by value.
    #
    # Typical values:
    # .if n .ds Bi BibTeX
    # .el .ds Bi BibTeX
    # Macro used as \*(Bi, but stored as a regexp
    #
    # regexp, value, esc, c and k are locals.

    # Escape regular-expression operators in the name so that it is
    # matched literally (e.g. a "+" would otherwise be a repetition).
    esc = ""
    for (k = 1; k <= length(name); ++k)
    {
	c = substr(name,k,1)
	if (index("\\.^$*+?()[]{}|",c) > 0)
	    esc = esc "\\" c
	else
	    esc = esc c
    }
    regexp = "\\\\\\*\\(" esc

    # Take the value from what follows ".ds name".  The original used
    # index($0,name)+3, which assumed a two-character name and was
    # fooled by an earlier occurrence of the name in the line (for
    # instance ".el .ds el ...").
    value = $0
    if (match(value,/[.]ds[ \t]+[^ \t]+/))
    {
	value = substr(value,RSTART+RLENGTH)
	sub(/^[ \t]+/,"",value)	# separator between name and value
	sub(/^"/,"",value)	# troff strips one leading quote
    }
    else
	value = substr($0,index($0,name)+3)	# original behavior

    Macro[regexp] = value
}


# Remove the file named s by running /bin/rm -f on it.
# The name is passed to the shell unquoted.
function delete_file(s)
{
    system("/bin/rm -f " s)
}


function end_font()
{
    # Output the end tag of every font that is still open, innermost
    # first, leaving Font_Level at zero.
    while (Font_Level > 0)
    {
	print_line(html_font_endtag(HTML_Font_Name[Font_Level]))
	Font_Level--
    }
}


# Finish the table of contents: close the last item and the list, add
# a horizontal rule, and close TOCFILE so that it can be read back.
function end_toc()
{
    print_toc("</LI>")
    print_toc("</UL>")
    print_toc("<HR>")
    close (TOCFILE)
}


function end_TP( is_dl)
{
    # Close the list at the current level, if it has any items, and
    # drop back one list level (never below zero).  is_dl is a local.

    if (Item_Count[List_Level] > 0)
    {
	is_dl = (List_Name[List_Level] == "DL")
	print_line(is_dl ? "</DD>" : "</LI>")	# end of the last item
	print_line(is_dl ? "</DL>" : "</UL>")	# end of the list itself
    }
    Item_Count[List_Level] = 0
    if (List_Level > 0)
	List_Level--
}


# Return s with each troff font escape \fB, \fC, \fI, \fP, \fR, \fS or
# \fT replaced by the corresponding HTML tag (possibly empty).  The
# open font is tracked in Font_Level and HTML_Font_Name[], which
# persist across calls.  tag is a local.
function font_sub(s, tag)
{
    while (match(s,/\\f[BCIPRST]/))
    {
	if (substr(s,RSTART+2,1) == "P") # revert to previous font
	{
	    # Close the current font; with no font open the name is
	    # empty and so is the tag.
	    tag = html_font_endtag(HTML_Font_Name[Font_Level])
	    if (Font_Level > 0)
		Font_Level--
	}
	else			# set explicit font
	{
	    Font_Level++
	    HTML_Font_Name[Font_Level] = Font_Map[substr(s,RSTART+2,1)]
	    tag = html_font_begintag(HTML_Font_Name[Font_Level])
	    # Handle ...\fB...\fR... style by ending previous font
	    if (Font_Level > 1)
	    {
		# The new font replaces the previous one instead of
		# nesting inside it, so the level drops back by one.
		tag = html_font_endtag(HTML_Font_Name[Font_Level-1]) tag
		HTML_Font_Name[Font_Level-1] = HTML_Font_Name[Font_Level]
		Font_Level--
	    }
	}
	# Replace the three-character escape by the tag.
	s = substr(s,1,RSTART-1) tag substr(s,RSTART+3)
    }
    return (s)
}


function html_font_begintag(name)
{
    # Return the start tag <name>, or the empty string when name is
    # empty (a font without an HTML equivalent).
    return ((name == "") ? "" : "<" name ">")
}


function html_font_endtag(name)
{
    # Return the end tag </name>, or the empty string when name is
    # empty (a font without an HTML equivalent).
    return ((name == "") ? "" : "</" name ">")
}


function initialize( k,html_arg)
{
    # One-time setup, called from BEGIN: version strings, environment
    # details for the output header, the HTML level, the font and
    # macro translation tables, temporary file names, and the patterns
    # used to find URLs and e-mail addresses.  Finishes by writing the
    # output header.  k and html_arg are locals.

    # Change these two lines whenever the program is modified
    VERSION_NUMBER = "1.06"
    VERSION_DATE = "[24-Oct-1997]"

    VERSION = "Version " VERSION_NUMBER " " VERSION_DATE

    # Each command pipe is closed as soon as its one line is read.
    "echo $LOGNAME" | getline LOGNAME
    close("echo $LOGNAME")
    "hostname" | getline HOSTNAME
    close("hostname")
    "date" | getline DATE
    close("date")

    # An operand assignment such as "HTML=3" (the documented usage) is
    # not carried out by awk until BEGIN has finished, so HTML would
    # always be empty here and the header would always claim level 2.
    # Look the assignment up in ARGV ourselves; only assignments that
    # precede the first input file are in effect when reading starts.
    html_arg = 0
    if (HTML == "")
    {
	for (k = 1; k < ARGC; ++k)
	{
	    if (ARGV[k] ~ /^HTML=/)
	    {
		HTML = substr(ARGV[k],6)
		html_arg = k
	    }
	    else if ((ARGV[k] != "") && (ARGV[k] !~ /^[A-Za-z_][A-Za-z0-9_]*=/))
		break		# first input file
	}
    }

    if (HTML == "")
	HTML = 2
    if ((HTML != 2) && (HTML != 3) && (HTML != "3.2"))
    {
	warning("Unsupported HTML level " HTML " requested: defaulting to HTML level 2")
	HTML = 2
	if (html_arg > 0)	# keep awk from restoring the bad value later
	    ARGV[html_arg] = "HTML=2"
    }

    # troff font letter -> HTML element name ("" means no markup)
    Font_Map["B"] = "STRONG"
    Font_Map["C"] = "TT"
    Font_Map["I"] = "EM"
    Font_Map["R"] = ""
    Font_Map["S"] = ""		# cannot map symbol font yet
    Font_Map["T"] = "TT"

    # Macro[] maps a regular expression matching a troff escape to its
    # replacement text; see strtohtml().
    Macro["\\\\e"]	= "\\"
    if (HTML == 2)
	Macro["\\\\0"]	= "\\&#160;"	# change non-breakable space to numeric entity
    else if (HTML >= 3)
	Macro["\\\\0"]	= "\\&nbsp;"	# can finally use named entity
    else			# warning() takes ONE argument: concatenate
	warning("No conversion implemented for \\\\0 (non-breakable space) in HTML level " HTML)

    # NOTE(review): these fixed names in /tmp are shared by all
    # concurrent runs of this program.
    TOCFILE = "/tmp/man2html.toc"
    TBLFILE = "/tmp/man2html.tbl"
    TMPFILE = "/tmp/man2html.tmp"
    H1 = 0
    H2 = 0

    Macro["\\\\\\(bu"]	= "\\&#164;"
    Macro["\\\\\\(em"]	= "---"
    Macro["\\\\\\(en"]	= "--"

    # The following fragment for setting URL_xxx variables
    # is borrowed intact from my bibtex-to-html.awk file:
    #
    # According to Internet RFC 1614 (May 1994), a URL is
    # defined in the document T. Berners-Lee, ``Uniform
    # Resource Locators'', March 1993, available at URL
    # ftp://info.cern.ch/pub/ietf/url4.ps.  Unfortunately,
    # that address is no longer valid.  However, I was able to
    # track down pointers from http://www.w3.org/ to locate a
    # suitable description in Internet RFC 1630 (June 1994).

    # NB: We additionally disallow & in a URL because it is
    # needed in SGML entities "&name;".  We also disallow =
    # and | because these are commonly used in \path=...= and
    # \path|...| strings in BibTeX files.  These restrictions
    # could be removed if we went to the trouble of first
    # encoding these special characters in %xy hexadecimal
    # format, but they are rare enough that I am not going to
    # do so for now.  The worst that will happen from this
    # decision is that an occasional URL in a BibTeX file will
    # be missing a surrounding anchor.

    # Bug fix [24-Oct-1997]: Add < and > to the set of excluded
    # characters, to avoid incorrectly including SGML markup inside a
    # URL.  Before this fix, "\fChttp://www/\fP" got translated
    # incorrectly to
    #     <TT><A HREF="http://www/</TT>">http://www/</TT></A>
    # instead of the correct
    #     <TT><A HREF="http://www">http://www</A></TT>

    URL_PATTERN = "[A-Za-z]+://[^ \",&=|<>]+"
    URL_OFFSET = 0
    URL_PREFIX = ""
    URL_SAVE_LABEL = 0

    E_MAIL_PATTERN = "[A-Za-z0-9_-]+@[A-Za-z0-9-]+([.][A-Za-z0-9-]+)*"
    E_MAIL_OFFSET = 0
    E_MAIL_PREFIX = "mailto:"
    E_MAIL_SAVE_LABEL = 0

    print_header()
}


function print_header( k,fname)
{
    # Write the leading HTML comments (generator, date, source file,
    # host) and the <!DOCTYPE> line for the selected HTML level.
    # k and fname are locals.

    # This runs from BEGIN, where FILENAME is still empty; fall back
    # to the first command-line operand that is not a var=value
    # assignment, i.e. the first input file.
    fname = FILENAME
    if (fname == "")
    {
	for (k = 1; k < ARGC; ++k)
	{
	    if ((ARGV[k] != "") && (ARGV[k] !~ /^[A-Za-z_][A-Za-z0-9_]*=/))
	    {
		fname = ARGV[k]
		break
	    }
	}
    }

    print_line("<!-- Warning: Do NOT edit this file. -->")
    print_line("<!-- It was created automatically by man2html.awk " VERSION " on " DATE " -->")
    print_line("<!-- from the file " strtohtml(fname) " at " HOSTNAME " -->")
    print_line("")

    if (HTML == 2)
	print_line("<!DOCTYPE HTML public \"-//IETF//DTD HTML//EN\">")
    else if (HTML == 3)	# We need level 3 HTML only because of our use of &nbsp; and &shy;
	print_line("<!DOCTYPE HTML public \"-//IETF//DTD HTML 3.0//EN\">")
    else if (HTML == "3.2")	# HTML 3.2 released 5-Nov-1996 at http://www.w3.org/pub/WWW
	print_line("<!DOCTYPE HTML public \"-//W3C//DTD HTML 3.2//EN\">")
}


# Append s as one line to TMPFILE, where the document body is collected
# until terminate() copies it to standard output.
function print_line(s)
{
    print s >TMPFILE
}


# Append s as one line to TOCFILE, where the table of contents is
# collected until terminate() copies it to standard output.
function print_toc(s)
{
    print s >TOCFILE
}


function protect_quoted_args( inside,k,s,c,out)
{
    # Rewrite $0 so that every space inside a "..." argument becomes
    # the character \177; awk's field splitting then keeps each quoted
    # argument in one field.  unprotect_quoted_arg() reverses this.
    # A line without quotes is left untouched.  The names in the
    # parameter list are all locals.

    if (index($0,"\"") == 0)
	return

    s = $0
    out = ""
    inside = 0
    for (k = 1; k <= length(s); ++k)
    {
	c = substr(s,k,1)
	if (c == "\"")
	    inside = !inside	# entering or leaving a quoted argument
	else if (inside && (c == " "))
	    c = "\177"
	out = out c
    }
    $0 = out			# assignment re-splits the fields
}


# Return the troff text s converted to HTML: troff escapes are removed
# or translated, the SGML special characters are protected, defined
# strings in Macro[] are expanded, font changes become tags, and
# (outside comments) URLs and e-mail addresses are wrapped in anchors.
# The order of the steps matters: & < > must be protected before any
# entity or tag is inserted.  name is a local.
function strtohtml(s, name)
{
    gsub(/\\$/,"",s)		# discard backslash-newline
    gsub(/\\-/,"-",s)		# show troff minus as ASCII minus
    gsub(/\\[&]/,"",s)		# remove no-op macros
    # gsub(/\\[|]/," ",s)	# change thin space to space
    gsub(/\\[|]/,"",s)		# delete thin space (nroff does too)

    gsub(/[&]/,"\\&amp;",s)	# protect 3 or 4
    gsub(/</,"\\&lt;",s)	# special SGML
    gsub(/>/,"\\&gt;",s)	# characters

    if (HTML == 2)
    {
	gsub(/\\ /,"\\&#160;",s)# represent literal space by numeric entity
	gsub(/\\%/,"",s)	# squeeze out discretionary hyphens
    }
    else if (HTML >= 3)
    {
	gsub(/\\ /,"\\&nbsp;",s) # preserve literal spaces

	# NB: several browsers fail to implement soft hyphen properly: they show
	# it as an explicit hyphen when the word is not broken at end of line,
	# instead of discarding it.  We translate it correctly, and hope that
	# broken browsers eventually get fixed, sigh...

	gsub(/\\%/,"\\&shy;",s)	# discretionary hyphen -> soft hyphen
    }
    # Inside a comment, quotes are left alone and only "--" is hidden.
    if (In_Comment)
	gsub(/--/,"__",s)	# must hide -- pairs to avoid grammar error
    else if (HTML == "3.2")
	gsub(/\"/,"\\&#34;",s)	# &quot; was left out of HTML 3.2, sigh...
    else
	gsub(/\"/,"\\&quot;",s)	# but other versions, and SGML, have &quot;

    # It is curious that browsers can display a bullet, but there is no
    # HTML markup to represent it, and it is absent from the standard
    # ISO8859-1 fonts
    # gsub(/\\\(bu/,"\\&#164;",s)	# change bullets to general currency sign
				# &curren; but use numeric code because
				# xmosaic does not recognize it

    # Each index of Macro[] is a regular expression and each value is
    # a gsub() replacement string; the iteration order is unspecified.
    for (name in Macro)		# substitute macro names
	gsub(name,Macro[name],s)

    s = font_sub(s)

    gsub(/\\\\/,"\\",s)		# reduce troff doubled backslash to single HTML one

#    if (index(s,"\\") > 0)	# check for anything we missed
#	warning("Possible unrecognized nroff/troff markup in [" s "]")

    if (!In_Comment)		# no link inside comment; otherwise, browser shows text
    {
	s = anchor(s,"HREF",URL_PATTERN,URL_OFFSET,URL_PREFIX,URL_SAVE_LABEL)
	s = anchor(s,"HREF",E_MAIL_PATTERN,E_MAIL_OFFSET,E_MAIL_PREFIX, \
		   E_MAIL_SAVE_LABEL)
    }

    return (s)
}


function terminate( x,y,found)
{
    # Called from END: finish the body and the table of contents, then
    # assemble the final document on standard output as
    #     body up to the first <H1>, table of contents, rest of body
    # and remove the temporary files.  x, y and found are locals.

    print_line("</BODY>")
    print_line("</HTML>")
    close (TMPFILE)
    end_toc()

    # Copy the part of the body that precedes the first section
    # heading.
    found = 0
    while ((getline x < TMPFILE) > 0)
    {
	if (x == "<H1>")
	{
	    found = 1
	    break
	}
	print x
    }

    # Without any heading the whole body has already been copied and
    # the contents list is empty.  The original nevertheless appended
    # the contents after </HTML> and printed the last body line a
    # second time.
    if (found)
    {
	while ((getline y < TOCFILE) > 0)
	    print y
    }
    close (TOCFILE)
    delete_file(TOCFILE)

    if (found)
    {
	print x			# the <H1> line that ended the first loop
	while ((getline x < TMPFILE) > 0)
	    print x
    }
    close (TMPFILE)
    delete_file(TMPFILE)
}


function unprotect_quoted_arg(s)
{
    # Undo protect_quoted_args() for one field: strip one leading and
    # one trailing double quote, and turn each \177 back into a space.

    if (s ~ /^"/)
	s = substr(s,2)			# leading quote
    if (s ~ /"$/)
	s = substr(s,1,length(s)-1)	# trailing quote
    gsub(/\177/," ",s)			# restore spaces
    return (s)
}


# Write message to /dev/stderr, prefixed by the current input file
# name, the line number within that file, and the literal text "%%"
# (print does not interpret percent signs).
function warning(message)
{
    print FILENAME ":" FNR ":%%" message >"/dev/stderr"
}
### ====================================================================
###  @Awk-file{
###     author          = "Nelson H. F. Beebe",
###     version         = "1.06",
###     date            = "24 October 1997",
###     time            = "21:34:34 MDT",
###     filename        = "man2html.awk",
###     address         = "Center for Scientific Computing
###                        University of Utah
###                        Department of Mathematics, 105 JWB
###                        155 S 1400 E RM 233
###                        Salt Lake City, UT 84112-0090
###                        USA",
###     telephone       = "+1 801 581 5254",
###     FAX             = "+1 801 581 4148",
###     URL             = "http://www.math.utah.edu/~beebe",
###     checksum        = "01400 968 2975 23193",
###     email           = "beebe@math.utah.edu (Internet)",
###     codetable       = "ISO/ASCII",
###     keywords        = "nroff, troff, UNIX manual page",
###     supported       = "yes",
###     docstring       = "This program converts UNIX manual pages
###                        in nroff/troff markup to strictly-conformant
###                        HTML 2.0, 3.0, or 3.2.  [Actually, only two
###                        HTML 3.x entities (`&nbsp;' and `&shy;')
###                        are used, and those rarely; otherwise, the
###                        syntax conforms strictly to HTML 2.0.]
###
###                        Usage:
###                        	nawk -f man2html.awk [HTML=2|3|3.2] \
###                                manpage-file >html-file
###
###                        The single option, HTML=2, HTML=3, or
###                        HTML=3.2, selects the HTML grammar level.
###                        The default is HTML=2.
###
###                        This program is normally run via a shell
###                        wrapper that offers an option for setting the
###                        output file name.  It has been used to
###                        successfully convert entire man-page
###                        collections on several UNIX systems to HTML
###                        form for convenient World-Wide Web browser
###                        access.
###
###                        Of those nroff/troff commands defined in the
###                        -man format used for UNIX manual pages, only
###                        the most commonly-used ones are supported;
###                        unrecognized ones will be warned about, and
###                        preserved as HTML comments in the output.
###
###                        UNIX man pages tend to be written in a
###                        highly-stylized fashion that we apply
###                        heuristics to in order to recover high-level
###                        HTML structure from low-level nroff/troff
###                        markup.  Deviations from conventional
###                        man-page writing practice will likely result
###                        in less-than-perfect translation to HTML.
###
###                        Although there are several other `man2html'
###                        translators available on the Internet, this
###                        one is entirely of my own authorship, with no
###                        code borrowing from anywhere else.
###
###                        The checksum field above contains a CRC-16
###                        checksum as the first value, followed by the
###                        equivalent of the standard UNIX wc (word
###                        count) utility output of lines, words, and
###                        characters.  This is produced by Robert
###                        Solovay's checksum utility.",
###  }
### ====================================================================

# ----------------------------------------------------------------------
# Main dispatch table.
#
# awk tries these rules top to bottom for every input line, and almost
# every action ends with "next", so the FIRST matching rule handles the
# line.  More specific patterns must therefore stay ahead of the more
# general ones (e.g. ".if t \{" before ".if t", and every known request
# before the catch-all /^[.][A-Za-z]/).  Lines that match no rule are
# ordinary text and are converted by the final pattern-less rule.
# ----------------------------------------------------------------------

BEGIN 		{ initialize() }

# ---- conditionals, string definitions and comments ----

/^[.]ie +t +[.]ds/ { getline }	# fall through: next line should be .el

/^[.]el +[.]ds/	{ define($3); next }

/^[.']\\"/	{ cmd_comment($0); next } # save comments

/^[.]if +n *\\\{/ { cmd_comment($0); next }

/^[.]if +t *\\\{/ { cmd_comment_block($0); next } # convert troff directives to comments

/^ *\\\}/	{ cmd_comment($0); next }

/^[.]if +t/	{ cmd_comment($0); next } # convert troff directives to comments

/^[.]if +n +[.]ds/ { define($4); next }

/^[.]if +n +[.]ti/ { cmd_comment($0); next } # convert nroff spacing directives to comments

/^[.]ie +n +[.]ds/ { define($4); next }

# ---- document structure ----

/^[.]SH/	{ cmd_SH(); next }

/^[.]SS/	{ cmd_SS(); next }

/^[.]TH/	{ cmd_TH(); next }

# ---- fonts, spacing, filling and lists ----

/^[.]B /	{ cmd_B(); next }

/^[.]I /	{ cmd_I(); next }

/^[.]IX /	{ cmd_IX(); next }

/^[.]R /	{ cmd_R(); next }

/^[.]ad/	{ cmd_ad(); next }

/^[.][BIR]$/	{ cmd_BIR(); next }

/^[.]BI /	{ cmd_XY("B","I"); next }

/^[.]br/	{ cmd_br(); next }

/^[.]BR /	{ cmd_XY("B","R"); next }

/^[.]ce[ 0-9]*$/ { cmd_ce(); next }

/^[.]hw/	{ cmd_hw(); next }

/^[.]IB /	{ cmd_XY("I","B"); next }

/^[.]IR /	{ cmd_XY("I","R"); next }

/^[.]ne/	{ cmd_ne(); next }

/^[.]RB /	{ cmd_XY("R","B"); next }

/^[.]RI /	{ cmd_XY("R","I"); next }

/^[.]nf/	{ cmd_nf(); next }

/^[.]fi/	{ cmd_fi(); next }

/^[.]IP/	{ cmd_IP(); next }

/^[.]LP/	{ cmd_LP(); next }

/^[.]na/	{ cmd_na(); next }

/^[.]PP/	{ cmd_PP(); next }

/^[.]RE/	{ cmd_RE(); next }

/^[.]RS/	{ cmd_RS(); next }

/^[.]sp/	{ cmd_sp(); next }

/^[.]TP/	{ cmd_TP(); next }

/^[.]TS/	{ cmd_TS(); next }

# ---- anything else that looks like a request is warned about ----

/^[.][A-Za-z]/	{ cmd_unknown(); next }

# ---- ordinary text ----

		{ print_line(strtohtml($0)) }

END		{ terminate(); }


# The anchor() function is adapted from my bibtex-to-html.awk file

function anchor(s,type,pattern,offset,prefix,save_label, name,rstart,rlength,save)
{
    # Return s with every substring matching pattern wrapped in an
    # anchor <A type="prefix+match">match</A>.
    #
    #   s          text to scan
    #   type       anchor attribute name ("HREF" or "NAME"); "NAME"
    #              anchors additionally get <STRONG>...</STRONG>
    #   pattern    dynamic regular expression locating the text
    #   offset     number of leading characters of each match that are
    #              context only and stay outside the anchored region
    #   prefix     text put in front of the match inside the attribute
    #              value (e.g. "mailto:")
    #   save_label passed through unchanged to the recursive call; not
    #              otherwise used here
    #
    # The remaining names are locals; save is retained only so that the
    # parameter list is unchanged.

    # Require the match to be longer than the discarded context:
    # otherwise the anchored text would be empty and the recursion
    # below might never make progress.
    if (match(s,pattern) && (RLENGTH > offset))
    {
	rstart = RSTART + offset	# private copies: the recursive call
	rlength = RLENGTH - offset	# overwrites RSTART and RLENGTH

	name = substr(s,rstart,rlength)
	sub(/ +at +/,"@",name)	# reduce "user at host" to "user@host"

	# Anchor this match, then process the remainder of the string.
	# (The original passed the uninitialized local `save' here
	# instead of the caller's save_label.)
	s = substr(s,1,rstart-1) \
	    "<A " type "=\"" prefix name "\">" \
	    ((type == "NAME") ? "<STRONG>" : "") \
	    substr(s,rstart,rlength) \
	    ((type == "NAME") ? "</STRONG>" : "") \
	    "</A>" \
	    anchor(substr(s,rstart+rlength),type,pattern,offset,prefix,save_label)
    }
    return (s)
}


function begin_toc()
{
    # Open the table of contents: a level-1 heading followed by the
    # start of the list that receives one entry per section heading.
    In_TOC_Item = 0		# no <LI> has been opened yet

    print_toc("<H1>")
    print_toc("Table of contents")
    print_toc("</H1>")
    print_toc("<UL>")
}


# Handle the .ad request by preserving the current input line as an
# HTML comment.
function cmd_ad()
{				# .ad: turn on adjust (flush-left-and-right justification)
    cmd_comment($0)		# no HTML equivalent
}


function cmd_B( s)
{
    # .B arg ...: output all arguments of the current line, joined as
    # written, in bold (<STRONG>).  s is a local.
    #
    # The original emitted only $2 for the unquoted form, so that
    # ".B foo bar" lost "bar"; the whole argument text is now used.

    end_font()
    s = $0
    sub(/^[.]B[ \t]*/,"",s)	# drop the request name
    gsub(/"/,"",s)		# drop the argument-delimiting quotes
    sub(/[ \t]+$/,"",s)		# drop trailing white space
    print_line("<STRONG>" strtohtml(s) "</STRONG>")
}


function cmd_BIR( letter)
{
    # A bare .B, .I or .R line: rewrite it as the in-line font escape
    # \fB, \fI or \fR and let strtohtml() do the conversion.
    # letter is a local.

    end_font()
    letter = substr($0,2,1)	# the font letter following the dot
    print_line(strtohtml("\\f" letter))
}


# Handle the .br request: it is treated exactly like .PP (see cmd_PP).
function cmd_br()
{
    cmd_PP()
}


function cmd_ce( k,n)
{
    # .ce nnn: turn on centering for next nnn lines (nnn = 0 turns it off)
    #
    # The request itself is kept as an HTML comment; the following nnn
    # input lines are consumed here and each is output with a trailing
    # <BR>.  k and n are locals.

    # A bare ".ce" centers one line in troff; the original treated it
    # as a count of zero and centered nothing.
    n = (NF >= 2) ? $2 : 1
    cmd_comment($0)
    if (n > 0)
    {
	# Only the HTML 3.2 grammar has <CENTER> ... </CENTER> (a
	# shorthand for <DIV ALIGN=CENTER> ... </DIV>), so the wrapper
	# is emitted for that level alone; at the other levels the
	# lines are merely kept one per output line.
	if (HTML == "3.2")
	    print_line("<CENTER>")
	for (k = 1; k <= n; ++k)
	{
	    # Stop at end of input rather than re-emitting the last
	    # line for each remaining count.
	    if ((getline) <= 0)
		break
	    print_line(strtohtml($0) "<BR>")
	}
	if (HTML == "3.2")
	    print_line("</CENTER>")
    }
}


function cmd_comment(s)
{
    # Output s as a single HTML comment.  In_Comment is raised while
    # strtohtml() runs so that it hides "--" pairs and adds no links,
    # and is always left at 0 on return.

    sub(/^[.']\\"/,"",s)	# remove troff comment prefix: it confuses html-pretty
    In_Comment = 1
    s = "<!-- " strtohtml(s) " -->"
    print_line(s)
    In_Comment = 0
}


function cmd_comment_block(s)
{
    # Turn a whole ".if t \{ ... \}" block into HTML comments: first
    # the opening line s, then every following input line up to and
    # including the one that starts with the closing "\}".

    cmd_comment(s)

    In_Comment = 1
    for (;;)
    {
	if ((getline s) <= 0)
	    break		# end of input (or read error)
	cmd_comment(s)
	if (s ~ /^ *\\\}/)
	    break		# found end of block
    }
    In_Comment = 0
}


function cmd_I( s)
{
    # .I arg ...: output all arguments of the current line, joined as
    # written, in italics (<EM>).  s is a local.
    #
    # The original emitted only $2 for the unquoted form, so that
    # ".I foo bar" lost "bar"; the whole argument text is now used.

    end_font()
    s = $0
    sub(/^[.]I[ \t]*/,"",s)	# drop the request name
    gsub(/"/,"",s)		# drop the argument-delimiting quotes
    sub(/[ \t]+$/,"",s)		# drop trailing white space
    print_line("<EM>" strtohtml(s) "</EM>")
}


function cmd_IX()
{
    # .IX index entry lines are simply discarded, together with any
    # continuation lines: as long as the current line ends with a
    # backslash, the next line is read in its place.
    while (match($0,/\\$/))
    {
	if ((getline) <= 0)
	    break		# no more input
    }
}


# cmd_fi: leave the preformatted environment if one is open; otherwise
# just record the input line as an HTML comment.
function cmd_fi()
{
    end_font()
    if (!In_PRE)
    {
	cmd_comment($0)
	return
    }
    print_line("</PRE>")
    In_PRE = 0
}


function cmd_hw()
{
    # .hw word-hyph-en-a-tion ex-cep-tions: kept only as an HTML comment
    cmd_comment($0)
}


# cmd_IP: start a new paragraph and count it in PP.
function cmd_IP()
{
    end_font()
    PP++

    if (In_PRE)			# <P> tags are illegal in <PRE>...</PRE> environments,
	print_line("")		# so use a blank line there, as cmd_PP() does
    else
	print_line("<P>")
}


# cmd_LP: start a new paragraph and count it in PP.
# NOTE(review): unlike cmd_PP(), this does not call end_TP(); confirm
# whether .LP should also close an open list.
function cmd_LP()
{
    end_font()
    PP++

    if (In_PRE)			# <P> tags are illegal in <PRE>...</PRE> environments,
	print_line("")		# so use a blank line there, as cmd_PP() does
    else
	print_line("<P>")
}


function cmd_na()
{
    # .na: no adjust (ragged-right instead of flush-left-and-right
    # justification).  There is no HTML equivalent, so the request is
    # preserved only as a comment.
    cmd_comment($0)
}


function cmd_ne()
{
    # .ne dimen: need dimen of vertical space before the end of the
    # page, otherwise force a page break (e.g. to prevent page breaks
    # after headings).  The request is preserved only as a comment.
    cmd_comment($0)
}


# cmd_nf: enter the preformatted environment unless one is already
# open, in which case the input line is only recorded as a comment.
function cmd_nf()
{
    end_font()
    if (!In_PRE)
    {
	print_line("<PRE>")
	In_PRE = 1
	return
    }
    cmd_comment($0)
}


# cmd_PP: start a new paragraph, count it in PP, and close the current
# list level via end_TP().
function cmd_PP()
{
    end_font()
    PP++

    # <P> tags are illegal in <PRE>...</PRE> environments, so a blank
    # line stands in for the paragraph break there.
    print_line(In_PRE ? "" : "<P>")

    end_TP()
}


# cmd_R: handle a ".R text ..." line by emitting its arguments with no
# surrounding font tags.
#
# If the first argument starts with a double quote, the rest of the line
# (minus trailing quotes and blanks) is the text.  Otherwise ALL of the
# whitespace-separated arguments are joined with single blanks, so that
# ".R foo bar" produces "foo bar" rather than silently dropping "bar".
# s and k are local variables.
function cmd_R( s,k)
{
    end_font()
    if (match($0,/^[.]R *\"/))
    {
	s = substr($0,RSTART+RLENGTH)
	gsub(/[" ]*$/,"",s)	# strip closing quote(s) and trailing blanks
    }
    else
    {
	s = $2
	for (k = 3; k <= NF; ++k) # keep every argument, not just the first
	    s = s " " $k
    }
    print_line(strtohtml(s))
}


# cmd_RE: close the innermost block opened by cmd_RS().
#
# Lists opened since the matching cmd_RS() call are closed first.  The
# closing </BLOCKQUOTE> is written only when a block is actually open:
# an unmatched .RE would otherwise produce an end tag with no start tag.
function cmd_RE()
{
    end_font()
    if (In_PRE)			# should not happen, but some man pages
	cmd_fi()		# are irregular
    while (List_Level > RSE_List_Level[RSE_Level])
	end_TP()
    if (RSE_Level > 0)
    {
	RSE_Level--
	print_line("</BLOCKQUOTE>")
    }
    else
	warning("Unmatched .RE: no open .RS block, so no </BLOCKQUOTE> was written")
}


# cmd_RS: open a <BLOCKQUOTE> block.  The list level in effect on entry
# is saved so that cmd_RE() can close any lists opened inside the block.
function cmd_RS()
{
    end_font()
    RSE_Level++
    RSE_List_Level[RSE_Level] = List_Level
    List_Level++		# lists inside the block get a level of their own
    print_line("<BLOCKQUOTE>")
}


function cmd_SH( s)
{
    # Section heading: rendered as a level-1 HTML heading.
    # (s is declared as a local variable but is not used.)
    cmd_SH_SS("H1")
}


function cmd_SS( s)
{
    # Subsection heading: rendered as a level-2 HTML heading.
    # (s is declared as a local variable but is not used.)
    cmd_SH_SS("H2")
}


# cmd_SH_SS(tag): common worker for cmd_SH() and cmd_SS(), which pass
# tag = "H1" and tag = "H2" respectively.  s is a local variable.
#
# Closes any open font, .RS block, and list, then writes the heading
# (wrapped in a named anchor HDR.<H1>[.<H2>]) to the body and a link to
# that anchor to the table of contents.  H1 and H2 are the running
# section and subsection counters; H2 restarts at 0 with each section.
function cmd_SH_SS(tag, s)
{				# [sub]section heading
    if (!TH_seen)		# should not happen, but some man pages are
	cmd_TH(substr($0,5))	# irregular
    end_font()
    while (RSE_Level > 0)	# close every still-open .RS block
	cmd_RE()
    while (List_Level > 0)	# close every still-open list
	end_TP()
    if (tag == "H1")
    {
	H1++
	if (H1 == 1)		# first section: start the table of contents
	    begin_toc()
	if (H2 > 0)		# previous section had subsections: close
	{			# its last item and its nested list
	    print_toc("</LI>")
	    print_toc("</UL>")
	}
	H2 = 0
	if (H1 > 1)
	    print_line("<HR>")	# a separating horizontal rule is a nice touch
    }
    else if (tag == "H2")
    {
	H2++
    }
    s = substr($0,5)		# heading text follows the 4-character ".SH " / ".SS "
    sub(/^ *\"/,"",s)		# strip an opening quote
    sub(/\" *$/,"",s)		# and a closing quote
    s = strtohtml(s)

    SH_SS_count = "." H1	# anchor suffix: .<section>[.<subsection>]
    if (H2 > 0)
	SH_SS_count = SH_SS_count "." H2

    print_line("<" tag ">")
    print_line("<A NAME=\"HDR" SH_SS_count "\">")
    print_line(s)
    print_line("</A>")
    print_line("</" tag ">")

    # Table-of-contents entry.  The previous item is closed unless this
    # is the first subsection of a section, in which case a nested list
    # is opened inside the still-open section item instead.
    if (In_TOC_Item && (H2 != 1))
	print_toc("</LI>")
    if (H2 == 1)
	print_toc("<UL>")
    In_TOC_Item = 1
    print_toc("<LI>")
    print_toc("<A HREF=\"#HDR" SH_SS_count "\">")
    print_toc(s)
    print_toc("</A>")
}


function cmd_sp()
{
    # .sp nnn: vertical space.  There is no sensible HTML equivalent,
    # so the request is preserved only as a comment.
    cmd_comment($0)
}


# cmd_TH: write the document preamble (<HTML>, <HEAD> with <TITLE> and a
# "made" <LINK>, and the opening <BODY>) and set TH_seen.
#
# The title is the current input line from its fourth character on,
# with backslash-continued lines joined.  line is a local variable: any
# value passed for it is overwritten by $0.
function cmd_TH( line)
{
    end_font()
    print_line("<HTML>")
    print_line("<HEAD>")
    print_line("<TITLE>")
    line = $0
    while (line ~ /\\$/)
    {
	# Without this check, a continuation backslash on the very last
	# input line would loop forever: getline fails, $0 is unchanged,
	# and the same backslash-terminated text is appended again.
	if ((getline) <= 0)
	    break
	line = substr(line,1,length(line)-1) $0
    }
    print_line(strtohtml(substr(line,4)))
    print_line("</TITLE>")
    print_line("<LINK REV=\"made\" HREF=\"mailto:" LOGNAME "@" HOSTNAME "\">")
    print_line("</HEAD>")
    print_line("")
    print_line("<BODY>")
    print_line("")
    TH_seen = 1
}


# cmd_TP: handle a tagged-paragraph request.  The NEXT input line is
# read as the item label.  Items are collected into an HTML list: a
# <UL> when the first label is exactly "\(bu", otherwise a <DL> whose
# <DT> is the label.  List state is kept per nesting level in
# Item_Count[], List_Name[], and List_Item[], indexed by List_Level.
# NOTE(review): the return value of getline is not checked, so at end
# of input $0 still holds the request line itself.
function cmd_TP()
{
    end_font()
    getline		# this is the item label, usually "\(bu" or ".B ..."
    if (Item_Count[List_Level] == 0) # then first item of new list
    {
	List_Level++
	Item_Count[List_Level] = 0
        if ($0 == "\\(bu")
	{
	    List_Name[List_Level] = "UL"
	    List_Item[List_Level] = "LI"
	}
	else
	{
	    List_Name[List_Level] = "DL"
	    List_Item[List_Level] = "DT"
	}
	if (Item_Count[List_Level] == 0) # always true here: count was just reset
	    print_line("<" List_Name[List_Level] ">")
    }
    Item_Count[List_Level]++
    if (List_Name[List_Level] == "DL")
    {
	if (Item_Count[List_Level] > 1)	# close the previous item's description
	    print_line("</DD>")
	print_line("<DT>")
	# Format the label according to the font macro it starts with
	if ($0 ~ /^[.]B /)
	    cmd_B()
	else if ($0 ~ /^[.]I /)
	    cmd_I()
	else if ($0 ~ /^[.]R /)
	    cmd_R()
	else if ($0 ~ /^[.]BR/)
	    cmd_XY("B","R")
	else if ($0 ~ /^[.]BI/)
	    cmd_XY("B","I")
	else if ($0 ~ /^[.]IB/)
	    cmd_XY("I","B")
	else if ($0 ~ /^[.]IR/)
	    cmd_XY("I","R")
	else if ($0 ~ /^[.]RB/)
	    cmd_XY("R","B")
	else if ($0 ~ /^[.]RI/)
	    cmd_XY("R","I")
	else			# plain text label
	    print_line(strtohtml($0))
	end_font()
	if (In_PRE)		# should not happen, but some man pages
	    cmd_fi()		# are irregular
	print_line("</DT>")
	print_line("<DD>")
    }
    else			# must be <UL> <LI> ... </LI> </UL> type list
    {
	if (Item_Count[List_Level] > 1)	# close the previous item
	    print_line("</LI>")
	print_line("<LI>")
    }
}


# cmd_TS: render a table as preformatted text.  tbl_nroff_cmd is a
# local variable.
function cmd_TS( tbl_nroff_cmd)
{
    # Save the table source, from the current line through the first
    # line starting with .TE (or the end of input), in a scratch file.
    print $0 > TBLFILE
    while ((getline) > 0)
    {
	print $0 > TBLFILE
	if ($0 ~ /^[.]TE/)
	    break
    }
    close(TBLFILE)

    # Let tbl, nroff, and col turn the table into formatted text, and
    # copy that text into a <PRE> environment.
    tbl_nroff_cmd = "tbl " TBLFILE " | nroff -man | col -b"

    print_line("<PRE>")
    while ((tbl_nroff_cmd | getline) > 0)
	print_line(strtohtml($0))
    close(tbl_nroff_cmd)
    print_line("</PRE>")
    delete_file(TBLFILE)
}


# cmd_unknown: fallback for an input line that no other handler claims.
# A warning is issued, and the line is kept in the output as a comment.
function cmd_unknown()
{
    end_font()
    warning("Unrecognized nroff/troff command in [" $0 "] changed to comment")
    cmd_comment($0)
}


# cmd_XY(x,y): handle an alternating-font macro line.  Arguments 2, 4,
# 6, ... of the line are set in font x and arguments 3, 5, 7, ... in
# font y, where x and y are keys of Font_Map.  The pieces are written
# one after another with no separator, followed by a single newline.
# font and k are local variables.
function cmd_XY(x,y, font,k)
{
    end_font()
    protect_quoted_args()	# keep blanks inside "..." from splitting fields
    k = 2
    while (k <= NF)
    {
	if (k % 2)
	    font = Font_Map[y]
	else
	    font = Font_Map[x]
	# Written directly, not via print_line(), to avoid a newline per piece
	printf("%s%s%s", html_font_begintag(font), strtohtml(unprotect_quoted_arg($k)), html_font_endtag(font)) > TMPFILE
	k++
    }
    print_line("")
}


# define(name): record a string definition taken from the current input
# line, which typically looks like
#     .if n .ds Bi BibTeX
#     .el .ds Bi BibTeX
# The string is later referenced as \*(Bi, so it is stored in Macro[]
# under a regular expression that matches that reference.  regexp is a
# local variable.
# NOTE(review): the fixed offset of 3 takes the value to start three
# characters after the start of name, i.e. it presumes a two-character
# name followed by one separator character.
function define(name, regexp)
{
    regexp = "\\\\\\*\\(" name
    Macro[regexp] = substr($0, 3 + index($0,name))
}


# delete_file(s): remove the file named s by running /bin/rm -f on it.
# s is pasted into the command line unquoted.
function delete_file(s)
{
    system("/bin/rm -f " s)
}


# end_font: emit the end tag of every font that is still open,
# innermost first, leaving Font_Level at 0.
function end_font()
{
    while (Font_Level > 0)
    {
	print_line(html_font_endtag(HTML_Font_Name[Font_Level]))
	Font_Level--
    }
}


# end_toc: finish the table of contents: close the last item and the
# list, add a horizontal rule, and close the table-of-contents file.
function end_toc()
{
    print_toc("</LI>")
    print_toc("</UL>")
    print_toc("<HR>")
    close(TOCFILE)
}


# end_TP: close the list at the current nesting level, if it has any
# items, then reset that level's item count and drop back one level
# (never below level 0).
function end_TP()
{
    if (Item_Count[List_Level] > 0)
    {
	if (List_Name[List_Level] != "DL")
	{
	    print_line("</LI>")
	    print_line("</UL>")
	}
	else
	{
	    print_line("</DD>")
	    print_line("</DL>")
	}
    }
    Item_Count[List_Level] = 0
    if (List_Level > 0)
	List_Level--
}


# font_sub(s): return s with each troff font escape \fX (X one of
# B C I P R S T) replaced by HTML font tags.  tag is a local variable.
#
# The open font is tracked in HTML_Font_Name[Font_Level].  \fP closes
# the current font; any other letter opens the font that Font_Map[]
# gives for it, first closing a font that is already open.
function font_sub(s, tag)
{
    while (match(s,/\\f[BCIPRST]/))
    {
	if (substr(s,RSTART+2,1) == "P") # revert to previous font
	{
	    tag = html_font_endtag(HTML_Font_Name[Font_Level])
	    if (Font_Level > 0)
		Font_Level--
	}
	else			# set explicit font
	{
	    Font_Level++
	    HTML_Font_Name[Font_Level] = Font_Map[substr(s,RSTART+2,1)]
	    tag = html_font_begintag(HTML_Font_Name[Font_Level])
	    # Handle ...\fB...\fR... style by ending previous font
	    if (Font_Level > 1)
	    {
		# The new font takes over the previous font's slot, so
		# the level does not grow with each explicit font change
		tag = html_font_endtag(HTML_Font_Name[Font_Level-1]) tag
		HTML_Font_Name[Font_Level-1] = HTML_Font_Name[Font_Level]
		Font_Level--
	    }
	}
	# Splice the tag(s) in place of the 3-character escape
	s = substr(s,1,RSTART-1) tag substr(s,RSTART+3)
    }
    return (s)
}


# html_font_begintag(name): return the start tag <name>, or an empty
# string when name is empty (a font with no HTML markup).
function html_font_begintag(name)
{
    return ((name == "") ? "" : ("<" name ">"))
}


# html_font_endtag(name): return the end tag </name>, or an empty
# string when name is empty (a font with no HTML markup).
function html_font_endtag(name)
{
    return ((name == "") ? "" : ("</" name ">"))
}


# initialize: set up version strings, user/host/date information, the
# HTML level, the font and macro translation tables, scratch file
# names, heading counters, and the URL and e-mail patterns, and then
# write the output header.
function initialize()
{
    # Change these two lines whenever the program is modified
    VERSION_NUMBER = "1.06"
    VERSION_DATE = "[24-Oct-1997]"

    VERSION = "Version " VERSION_NUMBER " " VERSION_DATE

    # Each command is closed once its single line of output has been
    # read, so that no pipe is left open for the rest of the run.
    "echo $LOGNAME" | getline LOGNAME
    close("echo $LOGNAME")
    "hostname" | getline HOSTNAME
    close("hostname")
    "date" | getline DATE
    close("date")

    if (HTML == "")
	HTML = 2
    if ((HTML != 2) && (HTML != 3) && (HTML != "3.2"))
    {
        warning("Unsupported HTML level " HTML " requested: defaulting to HTML level 2")
	HTML = 2
    }

    Font_Map["B"] = "STRONG"
    Font_Map["C"] = "TT"
    Font_Map["I"] = "EM"
    Font_Map["R"] = ""
    Font_Map["S"] = ""		# cannot map symbol font yet
    Font_Map["T"] = "TT"

    Macro["\\\\e"]	= "\\"
    if (HTML == 2)
	Macro["\\\\0"]	= "\\&#160;"	# change non-breakable space to numeric entity
    else if (HTML >= 3)
	Macro["\\\\0"]	= "\\&nbsp;"	# can finally use named entity
    else
	# Defensive only: HTML has been forced to 2, 3, or 3.2 above.
	# warning() takes a single argument, so the level is concatenated
	# into the message; passing it as a second argument is a fatal
	# error in gawk.
	warning("No conversion implemented for \\\\0 (non-breakable space) in HTML level " HTML)

    TOCFILE = "/tmp/man2html.toc"
    TBLFILE = "/tmp/man2html.tbl"
    TMPFILE = "/tmp/man2html.tmp"
    H1 = 0
    H2 = 0

    Macro["\\\\\\(bu"]	= "\\&#164;"
    Macro["\\\\\\(em"]	= "---"
    Macro["\\\\\\(en"]	= "--"

    # The following fragment for setting URL_xxx variables
    # is borrowed intact from my bibtex-to-html.awk file:
    #
    # According to Internet RFC 1614 (May 1994), a URL is
    # defined in the document T. Berners-Lee, ``Uniform
    # Resource Locators'', March 1993, available at URL
    # ftp://info.cern.ch/pub/ietf/url4.ps.  Unfortunately,
    # that address is no longer valid.  However, I was able to
    # track down pointers from http://www.w3.org/ to locate a
    # suitable description in Internet RFC 1630 (June 1994).

    # NB: We additionally disallow & in a URL because it is
    # needed in SGML entities "&name;".  We also disallow =
    # and | because these are commonly used in \path=...= and
    # \path|...| strings in BibTeX files.  These restrictions
    # could be removed if we went to the trouble of first
    # encoding these special characters in %xy hexadecimal
    # format, but they are rare enough that I am not going to
    # do so for now.  The worst that will happen from this
    # decision is that an occasional URL in a BibTeX file will
    # be missing a surrounding anchor.

    # Bug fix [24-Oct-1997]: Add < and > to the set of excluded
    # characters, to avoid incorrectly including SGML markup inside a
    # URL.  Before this fix, "\fChttp://www/\fP" got translated
    # incorrectly to
    #     <TT><A HREF="http://www/</TT>">http://www/</TT></A>
    # instead of the correct
    #     <TT><A HREF="http://www">http://www</A></TT>

    URL_PATTERN = "[A-Za-z]+://[^ \",&=|<>]+"
    URL_OFFSET = 0
    URL_PREFIX = ""
    URL_SAVE_LABEL = 0

    E_MAIL_PATTERN = "[A-Za-z0-9_-]+@[A-Za-z0-9-]+([.][A-Za-z0-9-]+)*"
    E_MAIL_OFFSET = 0
    E_MAIL_PREFIX = "mailto:"
    E_MAIL_SAVE_LABEL = 0

    print_header()
}


# print_header: write the leading "do not edit" comments and the
# <!DOCTYPE> declaration that matches the selected HTML level.
function print_header()
{
    print_line("<!-- Warning: Do NOT edit this file. -->")
    print_line("<!-- It was created automatically by man2html.awk " VERSION " on " DATE " -->")
    print_line("<!-- from the file " strtohtml(FILENAME) " at " HOSTNAME " -->")
    print_line("")

    if (HTML == 2)
    {
	print_line("<!DOCTYPE HTML public \"-//IETF//DTD HTML//EN\">")
	return
    }
    if (HTML == 3)
    {
	# Level 3 HTML is needed only because of the use of &nbsp; and &shy;
	print_line("<!DOCTYPE HTML public \"-//IETF//DTD HTML 3.0//EN\">")
	return
    }
    # HTML 3.2 released 5-Nov-1996 at http://www.w3.org/pub/WWW
    if (HTML == "3.2")
	print_line("<!DOCTYPE HTML public \"-//W3C//DTD HTML 3.2//EN\">")
}


# print_line(s): write s as one line of the document body, which is
# collected in the scratch file TMPFILE.
function print_line(s)
{
    print s > TMPFILE
}


# print_toc(s): write s as one line of the table of contents, which is
# collected in the scratch file TOCFILE.
function print_toc(s)
{
    print s > TOCFILE
}


# protect_quoted_args: replace each blank that lies between a pair of
# double quotes in $0 by the character \177, and store the result back
# into $0, so that a quoted argument stays in a single field.  $0 is
# left untouched when it contains no double quote.
# unprotect_quoted_arg() reverses the replacement.
# inside, k, and s are local variables.
function protect_quoted_args( inside,k,s)
{
    if (index($0,"\"") > 0)
    {
	s = $0
	inside = 0		# toggled by each double quote
	k = 0
	while (++k <= length(s))
	{
	    if (substr(s,k,1) == "\"")
		inside = !inside
	    else if (inside && (substr(s,k,1) == " "))
		s = substr(s,1,k-1) "\177" substr(s,k+1)
	}
	$0 = s
    }
}


# strtohtml(s): return s converted from troff text to HTML text.
# name is a local variable.
#
# In order: simple troff escapes are removed or simplified; the SGML
# special characters &, <, and > become entities; literal spaces and
# discretionary hyphens are translated according to the HTML level;
# "--" pairs are hidden inside comments, or double quotes become
# entities outside them; the entries of Macro[] are substituted; font
# escapes are turned into tags by font_sub(); doubled backslashes are
# reduced; and, outside comments, URLs and e-mail addresses are passed
# to anchor().
function strtohtml(s, name)
{
    gsub(/\\$/,"",s)		# discard backslash-newline
    gsub(/\\-/,"-",s)		# show troff minus as ASCII minus
    gsub(/\\[&]/,"",s)		# remove no-op macros
    # gsub(/\\[|]/," ",s)	# change thin space to space
    gsub(/\\[|]/,"",s)		# delete thin space (nroff does too)

    # These must precede the substitutions below that insert entities,
    # or the & of each inserted entity would itself be escaped
    gsub(/[&]/,"\\&amp;",s)	# protect 3 or 4
    gsub(/</,"\\&lt;",s)	# special SGML
    gsub(/>/,"\\&gt;",s)	# characters

    if (HTML == 2)
    {
	gsub(/\\ /,"\\&#160;",s)# represent literal space by numeric entity
	gsub(/\\%/,"",s)	# squeeze out discretionary hyphens
    }
    else if (HTML >= 3)
    {
	gsub(/\\ /,"\\&nbsp;",s) # preserve literal spaces

	# NB: several browsers fail to implement soft hyphen properly: they show
	# it as an explicit hyphen when the word is not broken at end of line,
	# instead of discarding it.  We translate it correctly, and hope that
	# broken browsers eventually get fixed, sigh...

	gsub(/\\%/,"\\&shy;",s)	# discretionary hyphen -> soft hyphen
    }
    if (In_Comment)
	gsub(/--/,"__",s)	# must hide -- pairs to avoid grammar error
    else if (HTML == "3.2")
	gsub(/\"/,"\\&#34;",s)	# &quot; was left out of HTML 3.2, sigh...
    else
	gsub(/\"/,"\\&quot;",s)	# but other versions, and SGML, have &quot;

    # It is curious that browsers can display a bullet, but there is no
    # HTML markup to represent it, and it is absent from the standard
    # ISO8859-1 fonts
    # gsub(/\\\(bu/,"\\&#164;",s)	# change bullets to general currency sign
				# &curren; but use numeric code because
				# xmosaic does not recognize it

    # Each index of Macro[] is a regular expression, and the element
    # value is its replacement text
    for (name in Macro)		# substitute macro names
	gsub(name,Macro[name],s)

    s = font_sub(s)

    gsub(/\\\\/,"\\",s)		# reduce troff doubled backslash to single HTML one

#    if (index(s,"\\") > 0)	# check for anything we missed
#	warning("Possible unrecognized nroff/troff markup in [" s "]")

    if (!In_Comment)		# no link inside comment; otherwise, browser shows text
    {
	s = anchor(s,"HREF",URL_PATTERN,URL_OFFSET,URL_PREFIX,URL_SAVE_LABEL)
	s = anchor(s,"HREF",E_MAIL_PATTERN,E_MAIL_OFFSET,E_MAIL_PREFIX, \
		   E_MAIL_SAVE_LABEL)
    }

    return (s)
}


# terminate: finish the body and the table of contents, then assemble
# the final document on standard output: the body up to the first <H1>
# line, the table of contents, and the rest of the body.  Both scratch
# files are deleted afterwards.
#
# If the body contains no <H1> line at all, the first loop has already
# copied the complete body.  In that case the table of contents is
# omitted, and the last body line is not printed a second time.
# x, y, and found are local variables.
function terminate( x,y,found)
{
    print_line("</BODY>")
    print_line("</HTML>")
    close (TMPFILE)		# finish writing, so that it can be read back
    end_toc()

    found = 0
    while ((getline x < TMPFILE) > 0)
    {
	if (x == "<H1>")	# first section heading: the contents go before it
	{
	    found = 1
	    break
	}
	print x
    }

    if (found)
    {
	while ((getline y < TOCFILE) > 0)
	    print y
    }
    close (TOCFILE)
    delete_file(TOCFILE)

    if (found)
    {
	print x			# the <H1> line that stopped the first loop
	while ((getline x < TMPFILE) > 0)
	    print x
    }
    close (TMPFILE)
    delete_file(TMPFILE)
}


# unprotect_quoted_arg(s): return s with every \177 character turned
# back into a blank and with one leading and one trailing double quote
# removed.  This reverses protect_quoted_args() for a single field.
function unprotect_quoted_arg(s)
{
    gsub(/\177/," ",s)		# put the protected blanks back
    sub(/^"/,"",s)		# drop the opening quote
    sub(/"$/,"",s)		# drop the closing quote
    return (s)
}


# warning(message): report message on /dev/stderr, prefixed with the
# current input file name and line number and the marker "%%".
function warning(message)
{
    print (FILENAME ":" FNR ":%%" message) > "/dev/stderr"
}