Skip to content

Instantly share code, notes, and snippets.

@tjguk
Created September 17, 2018 13:57
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save tjguk/e485ceca53a8f279d691badf84787ed7 to your computer and use it in GitHub Desktop.
Save tjguk/e485ceca53a8f279d691badf84787ed7 to your computer and use it in GitHub Desktop.
from __future__ import with_statement
import os, sys
import htmlentitydefs
import itertools
import operator
import posixpath
import re
import sgmllib
import tempfile
class FixupParser(sgmllib.SGMLParser):
    """Fix-up parser to scan the contents file generated by HTMLHelp.

    The parser reads the contents markup from *infile*, writes a cleaned-up
    nested ``<ul>``/``<li>`` list of links to *outfile*, and builds
    ``contents_map``: a mapping from each page URL to its root-to-leaf
    trail of ``(url, name)`` pairs, for use as a breadcrumb trail in
    individual pages.
    """

    # Prefix found on "Local" URLs in the contents file; it is stripped
    # before the URL is stored.
    UNWANTED_PREAMBLE = "mk:@MSITStore:PyWin32.chm::/"

    def __init__(self, infile, outfile):
        """Remember the input/output files and reset all tracking state."""
        sgmllib.SGMLParser.__init__(self)
        self.infile = infile
        self.outfile = outfile
        self.inside_li = False
        self.inside_object = False
        self.link_name = self.link_url = ""
        # (url, name) pairs from the root down to the current list level.
        self.trail = []
        # url -> copy of the trail at the time the entry was seen.
        self.contents_map = {}
        # One flag per open <ul>: did opening it push an entry onto the
        # trail?  Lets end_ul undo exactly what start_ul did.
        self._ul_pushed = []

    def start(self):
        """Parse the whole of the input file."""
        self.feed(self.infile.read())
        # Flush anything the parser is still buffering at end of input.
        self.close()

    def output(self, text):
        """Write *text* to the output file followed by a newline."""
        self.outfile.write(text + "\n")

    def start_ul(self, attrs):
        """If we're starting a list, close any unclosed list item and add
        the latest (ie this) url/name pair to the trail."""
        if self.inside_li:
            self.end_li()
        self.output("<ul>")
        pushed = bool(self.link_url)
        if pushed:
            self.trail.append((self.link_url, self.link_name))
        self._ul_pushed.append(pushed)

    def end_ul(self):
        """If we're finishing a list, close any unclosed list item and pop
        this list's url/name off the trail (if it added one)."""
        if self.inside_li:
            self.end_li()
        self.output("</ul>")
        # Only pop when the matching start_ul pushed; a list opened under
        # an entry with no URL must not remove its parent's trail entry.
        # An unbalanced </ul> (empty stack) falls back to popping.
        pushed = self._ul_pushed.pop() if self._ul_pushed else True
        if pushed and self.trail:
            self.trail.pop()

    def start_li(self, attrs):
        """If we're starting a list item, make a note of the fact so we
        can track objects within it."""
        if self.inside_li:
            self.end_li()
        self.output("<li>")
        self.inside_li = True

    def end_li(self):
        """If we're finishing a list item, make a note so no objects are
        tracked which are outside a list item."""
        self.output("</li>")
        self.inside_li = False

    def start_object(self, attrs):
        """The text/sitemap objects hold the real indexing info. Note that
        we're inside such an object so that we pick up its parameters."""
        attrs = dict(attrs)
        if attrs.get("type") != "text/sitemap":
            return
        if self.inside_object:
            # Previous object was never closed: finish it off first.
            self.end_object()
        self.link_name = self.link_url = ""
        self.inside_object = True

    def end_object(self):
        """At the end of an object tag, add the trail so far to the entry
        for this item's index and output an appropriate href."""
        if not self.inside_object:
            return
        trail = self.trail[:]
        # Add the item itself (with no URL) as the last crumb unless it
        # is already the last entry of the trail.
        if self.trail and self.trail[-1][0] != self.link_url:
            trail.append(("", self.link_name))
        self.contents_map[self.link_url] = trail
        self.output('<a href="%s">%s</a>' % (self.link_url, self.link_name))
        self.inside_object = False

    def start_param(self, attrs):
        """An object's param items are where the indexing info is stored.
        A "Name" param holds the name of the page; a "Local" item holds
        the slightly mungified URL which we strip before storing."""
        if not self.inside_object:
            return
        attrs = dict(attrs)
        param_name = attrs.get("name")
        if param_name == "Name":
            self.link_name = attrs.get("value", "<Unnamed>")
        elif param_name == "Local":
            link_url = attrs.get("value")
            if not link_url:
                self.link_url = "<Unlinked>"
            elif link_url.startswith(self.UNWANTED_PREAMBLE):
                self.link_url = link_url[len(self.UNWANTED_PREAMBLE):]
            else:
                # No preamble to strip: keep the URL intact rather than
                # blindly chopping off its first characters.
                self.link_url = link_url
# Structural tags stripped from each page: the page body is re-wrapped in
# a fresh template, so the original wrappers are unwanted.
UNWANTED_MARKUP = ["html", "body", "head"]
UNWANTED_RE = re.compile("|".join("<%s>|</%s>" % (markup, markup) for markup in UNWANTED_MARKUP), re.IGNORECASE)
UNWANTED_TITLE_RE = re.compile(r"<title>.*</title>", re.IGNORECASE)
UNWANTED_GENERATOR = r'<META NAME="GENERATOR" CONTENT="Autoduck, by erica@microsoft.com">'
UNWANTED_HR_RE = re.compile(r"<hr>", re.IGNORECASE)

# Titles whose text spans several lines ("." in UNWANTED_TITLE_RE does not
# match a newline, "[^<]" does).
_MULTILINE_TITLE_RE = re.compile(r"<title>[^<]*</title>", re.IGNORECASE)
# "&name" optionally followed by its terminating semicolon.
_NAMED_ENTITY_RE = re.compile(r"&([A-Za-z][A-Za-z0-9]*)(;?)")
# "&#123" not followed by ";".  The lookahead also rejects a digit so the
# regex cannot backtrack into the middle of a correctly terminated
# reference ("&#123;" must not become "&#12;3;").
_NUMERIC_ENTITY_RE = re.compile(r"(&#\d+)(?![\d;])")


def _terminate_named_entity(match):
    """Return the matched "&name" text with a semicolon after the entity.

    Correctly terminated known entities are returned untouched. Otherwise
    the longest known entity name at the start of the matched name gets
    the semicolon ("&ltfoo" -> "&lt;foo"); text that starts with no known
    entity ("AT&T") is left alone.
    """
    name, terminator = match.groups()
    known = htmlentitydefs.entitydefs
    if terminator and name in known:
        return match.group(0)
    for end in range(len(name), 0, -1):
        if name[:end] in known:
            return "&" + name[:end] + ";" + name[end:] + terminator
    return match.group(0)


def _close_list_item(line):
    """Append "</li>" to a line that opens a list item but never closes it."""
    lowered = line.lower()
    if lowered.startswith(u"<li>") and u"</li>" not in lowered:
        return line + u"</li>"
    return line


def munged_text(text):
    """Return *text* cleaned up for embedding in the page template.

    * entity and character references are made to end with a semicolon;
    * <title> elements, <html>/<head>/<body> tags, the Autoduck generator
      <META> tag and <hr> tags are removed;
    * lines starting with "<li>" which contain no "</li>" get one appended.

    The result is re-joined with "\\n", so original line endings and any
    trailing newline are not preserved.
    """
    #
    # Fix up entity & character defs so they end with semicolons
    #
    text = _NAMED_ENTITY_RE.sub(_terminate_named_entity, text)
    text = _NUMERIC_ENTITY_RE.sub(r"\g<1>;", text)

    text = _MULTILINE_TITLE_RE.sub("", text)
    text = UNWANTED_RE.sub("", text)
    text = UNWANTED_TITLE_RE.sub("", text)
    text = text.replace(UNWANTED_GENERATOR, "")
    text = UNWANTED_HR_RE.sub("", text)
    return u"\n".join(_close_list_item(line) for line in text.splitlines())
def relpath(target_url, current_url):
    """Return the relative URL which leads from *current_url* to *target_url*.

    Both arguments are "/"-separated URLs relative to the same root, each
    naming a file (the last component). The result climbs out of the
    directory of *current_url* with ".." as far as the deepest directory
    common to both, then descends to *target_url*.

    URLs are always split on "/", whatever the platform's path separator,
    and empty components are ignored so that a file at the root counts as
    being zero directories deep.
    """
    target_path, target_file = posixpath.split(target_url)
    current_path = posixpath.dirname(current_url)
    start_parts = [part for part in current_path.split("/") if part]
    target_parts = [part for part in target_path.split("/") if part]

    # Number of leading directories the two paths share.
    common = 0
    for start_part, target_part in zip(start_parts, target_parts):
        if start_part != target_part:
            break
        common += 1

    climb = [posixpath.pardir] * (len(start_parts) - common)
    return posixpath.join(*(climb + target_parts[common:] + [target_file]))
#
# Body of the index.html landing page: main () uses this text as the
# "content" value when filling in the HTML page template.
#
INDEX_CONTENT = """
<h1>PyWin32 Documentation</h1>
<p>This documentation is generated from the .chm file which is shipped with
the PyWin32 extensions for Python. Apart from absolutely essential cleanups
to make the HTML display properly, no changes have been made.</p>
<p><b>Updated 29th October 2009</b>: Now includes pywin32-214 documentation</p>
<ul>
<li> <a href="contents.html">Table of Contents</a> </li>
<li> <a href="PyWin32.html">Front Page</a> </li>
<li> <a href="html/CHANGES.txt">Project ChangeLog</a> </li>
<!-- li> <a href="changes.html">Added / Updated pages</a></li -->
</ul>
"""
#
# Navigation is a separate string so that it can be
# excluded from, eg, the contents page.
#
# It is a %-format template taking a mapping with the keys
# root_path, toc_filename and breadcrumbs.
#
NAVIGATION_HTML = """
<div class="navigation">
<a href="%(root_path)s%(toc_filename)s">Contents</a> | %(breadcrumbs)s
</div>
"""
#
# Template for every generated page. It is a %-format template taking
# a mapping with the keys title, root_path, css_filename, navigation
# and content; root_path is placed directly in front of the stylesheet
# filename to form its href.
#
HTML = """
<html>
<head>
<title>%(title)s</title>
<link href="%(root_path)s%(css_filename)s" rel="stylesheet" type="text/css" media="all">
<meta http-equiv="Content-Type" content="text/html; charset=utf-8">
</head>
<body>
%(navigation)s
<div id="content">
%(content)s
</div>
</body>
</html>
"""
def fixup_isapi_links(text, root_path="../../../"):
    """Return *text* with root-absolute hrefs rewritten to relative ones.

    Every ``href="/...`` has its leading "/" replaced by *root_path*. The
    default climbs three directories, matching the depth of the one page
    registered in SPECIAL_PROCESSING. Protocol-relative links
    (``href="//host/..."``) name another host and are left untouched.
    """
    # A function replacement avoids any backslash/group interpretation of
    # root_path by re.sub.
    return re.sub(r'href="/(?!/)', lambda match: 'href="' + root_path, text)


# Per-page extra clean-up, keyed by the page's "/"-separated path relative
# to the decompiled documentation root.
SPECIAL_PROCESSING = {
    "html/isapi/doc/isapi.html": fixup_isapi_links,
}
# Command-line flags understood by main ().
ARGS = set(["nogenerate", "debug", "nosvn"])


def _render_page(title, content, css_filename, root_path="", navigation=""):
    """Return a complete page: the HTML template filled with these values."""
    return HTML % {
        "title": title,
        "content": content,
        "css_filename": css_filename,
        "root_path": root_path,
        "navigation": navigation,
    }


def _generate_contents(hhc_filepath):
    """Run FixupParser over the HTMLHelp contents file.

    Returns (toc_html, contents_map): the list markup the parser wrote
    and its url -> breadcrumb-trail mapping.
    """
    # The parser needs a file to write to; an anonymous temporary file is
    # closed and removed automatically, unlike a bare mkstemp () handle.
    with tempfile.TemporaryFile(mode="w+") as toc_file:
        with open(hhc_filepath) as infile:
            parser = FixupParser(infile, toc_file)
            parser.start()
        toc_file.seek(0)
        return toc_file.read(), parser.contents_map


def main(args=()):
    """Convert the PyWin32 .chm documentation into standalone HTML pages.

    *args* is a sequence of flags, each of which must be in ARGS:

    * "nogenerate" - skip decompiling the .chm and generating the pages;
    * "debug" - process at most 30 files from each directory;
    * "nosvn" - skip the Subversion status scan and the changes page.

    Raises RuntimeError if an unrecognised flag is passed.
    """
    # Local import: subprocess is not among this file's top-level imports.
    import subprocess

    unrecognised = set(args) - ARGS
    if unrecognised:
        raise RuntimeError("Arguments %s not recognised; should only be %s" % (", ".join(sorted(unrecognised)), ", ".join(sorted(ARGS))))

    chm_filepath = "./PyWin32.chm"  ## os.path.join (sys.prefix, "lib", "site-packages", "PyWin32.chm")
    html_tempdir = os.path.join(tempfile.gettempdir(), "pywin32-docs-htmlhelp")
    html2_tempdir = "."
    css_filename = "pywin32.css"
    toc_filename = "contents.html"
    changes_filename = "changes.html"

    # NOTE(review): the original indentation of this function was lost, so
    # the extent of the "nogenerate" block is an assumption: it is taken to
    # cover all page generation, up to the svn section - TODO confirm.
    if "nogenerate" not in args:
        print("Decompiling .chm...")
        if not os.path.exists(html_tempdir):
            os.mkdir(html_tempdir)
        # An argument list (no shell) keeps paths containing spaces intact.
        subprocess.call(["hh.exe", "-decompile", html_tempdir, chm_filepath])

        print("Writing index.html...")
        with open(os.path.join(html2_tempdir, "index.html"), "w") as outfile:
            outfile.write(_render_page("PyWin32 Documentation", INDEX_CONTENT, css_filename))

        print("Generating contents...")
        toc_html, contents_map = _generate_contents(os.path.join(html_tempdir, "pywin32.hhc"))

        print("Writing table of contents...")
        with open(os.path.join(html2_tempdir, toc_filename), "w") as outfile:
            outfile.write(_render_page("PyWin32 Documentation", toc_html, css_filename))

        for html_dirname, dirnames, filenames in os.walk(html_tempdir, topdown=True):
            if "debug" in args:
                filenames = filenames[:30]
            html2_dirname = os.path.join(html2_tempdir, html_dirname[1 + len(html_tempdir):])
            print("%s => %s" % (html_dirname, html2_dirname))
            # Walking top-down means the parent directory already exists.
            if not os.path.exists(html2_dirname):
                os.mkdir(html2_dirname)

            for filename in filenames:
                if not filename.lower().endswith((".txt", ".html")):
                    continue
                html_filepath = os.path.join(html_dirname, filename)
                html2_filepath = os.path.join(html2_dirname, filename)
                # Path of the page below the documentation root, URL-style.
                relative_filepath = html_filepath[1 + len(html_tempdir):].replace("\\", "/")
                # Directory depth below the root, counted on the URL-style
                # path so it does not depend on the platform's separator.
                depth = relative_filepath.count("/")
                print(" %s (%d)" % (html_filepath, depth))
                root_path = "../" * depth

                with open(html_filepath) as infile:
                    content = unicode(infile.read(), "cp1252")
                content = munged_text(content)
                special_processing = SPECIAL_PROCESSING.get(relative_filepath)
                if special_processing:
                    content = special_processing(content)

                if filename.lower().endswith(".txt"):
                    # Text files are cleaned up but not wrapped in the template.
                    html = content
                else:
                    # The page title is its first <h1>, else its filename.
                    heading = re.search(r"<h1>([^<]+)</h1>", content, re.IGNORECASE)
                    title = heading.group(1) if heading else filename
                    breadcrumb_trail = contents_map.get(relative_filepath, [])
                    # NOTE(review): a crumb without a URL gets its own name
                    # as the href, as in the original - confirm intended.
                    breadcrumbs = u" &gt; ".join(
                        u'<a href="%s">%s</a>' % (relpath(url, relative_filepath) if url else name, name)
                        for (url, name) in breadcrumb_trail
                    )
                    navigation = NAVIGATION_HTML % {
                        "root_path": root_path,
                        "toc_filename": toc_filename,
                        "breadcrumbs": breadcrumbs,
                    }
                    html = _render_page(title, content, css_filename, root_path, navigation)

                with open(html2_filepath, "w") as outfile:
                    outfile.write(html.encode("utf8"))

    if "nosvn" not in args:
        import pysvn

        print("Finding changes...")
        UNCHANGED = [pysvn.wc_status_kind.normal, pysvn.wc_status_kind.ignored]
        EXCLUDE_FROM_CHANGES = ["changes.html", "convert_to_html.py", "pywin32.chm"]
        svn = pysvn.Client()
        # Put newly generated pages under version control first.
        svn.add([i.path for i in svn.status(".") if i.path.endswith(".html") and i.text_status == pysvn.wc_status_kind.unversioned])
        # groupby needs its input sorted on the grouping key.
        changes = sorted((i for i in svn.status(".") if i.text_status not in UNCHANGED), key=operator.attrgetter("text_status"))

        content = ["<h1>PyWin32 Documentation Changes</h1>"]
        for status, items in itertools.groupby(changes, operator.attrgetter("text_status")):
            content.append("<h2>%s</h2>" % status)
            content.append("<ul>")
            for item in items:
                if os.path.basename(item.path).lower() in EXCLUDE_FROM_CHANGES:
                    continue
                content.append('<li><a href="%s">%s</a></li>' % (item.path, os.path.splitext(item.path)[0]))
            content.append("</ul>")

        print("Writing changes...")
        with open(os.path.join(html2_tempdir, changes_filename), "w") as outfile:
            outfile.write(_render_page("PyWin32 Documentation Changes", "\n".join(content), css_filename))


if __name__ == '__main__':
    main(sys.argv[1:])
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment