# Created September 17, 2018 13:57
# Save tjguk/e485ceca53a8f279d691badf84787ed7 to your computer and use it in GitHub Desktop.
# This file contains bidirectional Unicode text that may be interpreted or
# compiled differently than what appears below. To review, open the file in
# an editor that reveals hidden Unicode characters.
# Learn more about bidirectional Unicode characters
from __future__ import with_statement

import htmlentitydefs
import io
import itertools
import operator
import os
import posixpath
import re
import sgmllib
import subprocess
import sys
import tempfile
class FixupParser (sgmllib.SGMLParser): | |
"""Fix-up parser to scan the contents file generated | |
by HTMLHelp, generate a suitable HTML output file for | |
use within standard HTML files and to provide a | |
root-to-leaf mapping for use as a breadcrumb trail | |
in individual pages. | |
""" | |
def __init__ (self, infile, outfile): | |
sgmllib.SGMLParser.__init__ (self) | |
self.infile = infile | |
self.outfile = outfile | |
self.inside_li = False | |
self.inside_object = False | |
self.link_name = self.link_url = "" | |
self.trail = [] | |
self.contents_map = {} | |
def start (self): | |
self.feed (self.infile.read ()) | |
def output (self, text): | |
self.outfile.write (text + "\n") | |
def start_ul (self, attrs): | |
"""If we're starting a list, close any unclosed | |
list item and add the latest (ie this) url/name | |
pair to the trail.""" | |
if self.inside_li: | |
self.end_li () | |
self.output ("<ul>") | |
if self.link_url: | |
self.trail.append ((self.link_url, self.link_name)) | |
def end_ul (self): | |
"""If we're finishing a list, close any unclosed | |
list item and pop this url/name off the trail.""" | |
if self.inside_li: | |
self.end_li () | |
self.output ("</ul>") | |
if self.trail: | |
self.trail.pop () | |
def start_li (self, attrs): | |
"""If we're starting a list item, make a note of the | |
fact so we can track objects within it.""" | |
if self.inside_li: | |
self.end_li () | |
self.output ("<li>") | |
self.inside_li = True | |
def end_li (self): | |
"""If we're finishing a list item, make a note so no | |
objects are tracked which are outside a list item.""" | |
self.output ("</li>") | |
self.inside_li = False | |
def start_object (self, attrs): | |
"""The text/sitemap objects hold the real indexing info. | |
Note that we're inside such an object so that we pick up | |
its parameters.""" | |
attrs = dict (attrs) | |
if attrs.get ("type") == "text/sitemap": | |
if self.inside_object: | |
self.end_object () | |
self.link_name = self.link_url = "" | |
self.inside_object = True | |
def end_object (self): | |
"""At the end of an object tag, add the trail so far to | |
the entry for this item's index and output an appropriate | |
href.""" | |
if self.inside_object: | |
self.contents_map[self.link_url] = self.trail[:] | |
if self.trail and self.trail[-1][0] <> self.link_url: | |
self.contents_map[self.link_url].append (("", self.link_name)) | |
self.output ('<a href="%s">%s</a>' % (self.link_url, self.link_name)) | |
self.inside_object = False | |
"""An object's param items are where the indexing info is | |
stored. A "Name" param holds the name of the page; a "Local" | |
item holds the slightly mungified URL which we strip before | |
storing.""" | |
def start_param (self, attrs): | |
UNWANTED_PREAMBLE = "mk:@MSITStore:PyWin32.chm::/" | |
if self.inside_object: | |
attrs = dict (attrs) | |
if attrs.get ("name") == "Name": | |
self.link_name = attrs.get ("value", "<Unnamed>") | |
elif attrs.get ("name") == "Local": | |
link_url = attrs.get ("value") | |
if link_url: | |
self.link_url = link_url[len (UNWANTED_PREAMBLE):] | |
else: | |
self.link_url = "<Unlinked>" | |
UNWANTED_MARKUP = ["html", "body", "head"]
UNWANTED_RE = re.compile(
    "|".join("<%s>|</%s>" % (markup, markup) for markup in UNWANTED_MARKUP),
    re.IGNORECASE
)
UNWANTED_TITLE_RE = re.compile(r"<title>.*</title>", re.IGNORECASE)
UNWANTED_GENERATOR = r'<META NAME="GENERATOR" CONTENT="Autoduck, by erica@microsoft.com">'
UNWANTED_HR_RE = re.compile(r"<hr>", re.IGNORECASE)

# A title whose text may run over several lines ("." does not match a
# newline in UNWANTED_TITLE_RE, "[^<]" does).
_MULTILINE_TITLE_RE = re.compile(r"<title>[^<]*</title>", re.IGNORECASE)
# "&" + a maximal run of letters/digits + an optional terminating ";".
_NAMED_ENTITY_RE = re.compile(r"&([A-Za-z][A-Za-z0-9]*)(;?)")
# A decimal character reference which is not already terminated.  The
# lookahead also rejects a digit so the match cannot backtrack into the
# middle of the number ("&#123;" must not become "&#12;3;").
_NUMERIC_ENTITY_RE = re.compile(r"(&#\d+)(?![\d;])")
_ENTITY_NAMES = frozenset(htmlentitydefs.entitydefs)


def _terminate_named_entity(match):
    """Return the matched entity reference with its semicolon in place.

    A correctly terminated, known entity is returned untouched.
    Otherwise the longest known entity name which starts the matched
    word is terminated and the rest of the word is kept as plain text
    ("&ltfoo" -> "&lt;foo").  Text which does not start with a known
    entity name is returned unchanged.
    """
    word, semicolon = match.groups()
    if semicolon and word in _ENTITY_NAMES:
        return match.group(0)
    for end in range(len(word), 0, -1):
        if word[:end] in _ENTITY_NAMES:
            return u"&%s;%s%s" % (word[:end], word[end:], semicolon)
    return match.group(0)


def _close_list_item(line):
    """Append </li> to a line which opens a list item but never closes it."""
    lowered = line.lower()
    if lowered.startswith(u"<li>") and u"</li>" not in lowered:
        return line + u"</li>"
    return line


def munged_text(text):
    """Return the HTML in `text` cleaned up for embedding in a page.

    text -- unicode string holding the HTML of one decompiled page

    The clean-up steps are: terminate entity and character references
    with a semicolon; remove <title> elements, the <html>/<body>/<head>
    tags, the Autoduck generator <META> tag and <hr> rules; close any
    <li> which starts a line and is not closed on that line.

    The result is a unicode string whose lines are joined by "\\n" with
    no trailing newline.
    """
    #
    # Fix up entity & character defs so they end with semicolons
    #
    text = _NAMED_ENTITY_RE.sub(_terminate_named_entity, text)
    text = _NUMERIC_ENTITY_RE.sub(r"\g<1>;", text)

    text = _MULTILINE_TITLE_RE.sub("", text)
    text = UNWANTED_RE.sub("", text)
    text = UNWANTED_TITLE_RE.sub("", text)
    text = text.replace(UNWANTED_GENERATOR, "")
    text = UNWANTED_HR_RE.sub("", text)
    return u"\n".join(_close_list_item(line) for line in text.splitlines())
def relpath(target_url, current_url):
    """Return the URL of `target_url` relative to the page `current_url`.

    Both arguments are "/"-separated URLs of files, relative to the same
    root, eg relpath("a/b/x.html", "a/c/y.html") == "../b/x.html".

    The URLs are always split on "/" -- never on the platform's path
    separator -- and empty components are ignored so that a page at the
    root is not treated as though it sat inside a directory.
    """
    target_path, target_file = posixpath.split(target_url)
    current_path, _current_file = posixpath.split(current_url)
    start_list = [part for part in current_path.split("/") if part]
    path_list = [part for part in target_path.split("/") if part]

    # Count the leading directories which the two paths share.
    n_common = 0
    for here, there in zip(start_list, path_list):
        if here != there:
            break
        n_common += 1

    # Climb out of what's left of the current directory, then descend
    # into what's left of the target directory.
    climb = [posixpath.pardir] * (len(start_list) - n_common)
    rel_list = climb + path_list[n_common:]
    return posixpath.join("/".join(rel_list), target_file)
#
# Body of the index page; it is dropped into the HTML template
# below like the content of any other page.
#
INDEX_CONTENT = """
<h1>PyWin32 Documentation</h1>
<p>This documentation is generated from the .chm file which is shipped with
the PyWin32 extensions for Python. Apart from absolutely essential cleanups
to make the HTML display properly, no changes have been made.</p>
<p><b>Updated 29th October 2009</b>: Now includes pywin32-214 documentation</p>
<ul>
<li> <a href="contents.html">Table of Contents</a> </li>
<li> <a href="PyWin32.html">Front Page</a> </li>
<li> <a href="html/CHANGES.txt">Project ChangeLog</a> </li>
<!-- li> <a href="changes.html">Added / Updated pages</a></li -->
</ul>
"""

#
# Navigation is a separate string so that it can be
# excluded from, eg, the contents page.
#
# Expects the keys: root_path, toc_filename, breadcrumbs
#
NAVIGATION_HTML = """
<div class="navigation">
<a href="%(root_path)s%(toc_filename)s">Contents</a> | %(breadcrumbs)s
</div>
"""

#
# Template for every generated page.
#
# Expects the keys: title, root_path, css_filename, navigation, content
#
HTML = """
<html>
<head>
<title>%(title)s</title>
<link href="%(root_path)s%(css_filename)s" rel="stylesheet" type="text/css" media="all">
<meta http-equiv="Content-Type" content="text/html; charset=utf-8">
</head>
<body>
%(navigation)s
<div id="content">
%(content)s
</div>
</body>
</html>
"""
# An href which starts at the site root: exactly one leading slash, so
# that protocol-relative links (href="//host/...") are left alone.
_ROOT_RELATIVE_HREF_RE = re.compile(r'(href=")/(?!/)', re.IGNORECASE)


def fixup_isapi_links(text):
    """Return `text` with root-relative links made page-relative.

    Every double-quoted href which starts with a single "/" has that
    slash replaced by "../../../"; all other links are left untouched.
    """
    return _ROOT_RELATIVE_HREF_RE.sub(r"\g<1>../../../", text)


#
# Maps the relative, "/"-separated path of a page to a function
# which takes that page's content and returns the content to use.
#
SPECIAL_PROCESSING = {
    "html/isapi/doc/isapi.html": fixup_isapi_links,
}
ARGS = set(["nogenerate", "debug", "nosvn"])

_H1_RE = re.compile(r"<h1>([^<]+)</h1>", re.IGNORECASE)
_CONVERTED_EXTENSIONS = (".txt", ".html")


def _render_page(title, content, root_path, css_filename, navigation=""):
    """Return the page template filled in with the values given."""
    return HTML % {
        "title": title,
        "content": content,
        "root_path": root_path,
        "css_filename": css_filename,
        "navigation": navigation,
    }


def _build_contents(hhc_filepath):
    """Return (contents_html, contents_map) for the .hhc contents file."""
    handle, filename = tempfile.mkstemp()
    try:
        # fdopen takes ownership of the handle so that it is always closed.
        with os.fdopen(handle, "w") as outfile:
            with open(hhc_filepath) as infile:
                parser = FixupParser(infile, outfile)
                parser.start()
        with open(filename) as contents_file:
            return contents_file.read(), parser.contents_map
    finally:
        os.remove(filename)


def _breadcrumbs(trail, relative_filepath):
    """Return the HTML for a trail of (url, name) pairs.

    An entry without a URL stands for the page itself and is shown as
    plain text rather than as a link whose target is the page's name.
    """
    crumbs = []
    for url, name in trail:
        if url:
            crumbs.append(u'<a href="%s">%s</a>' % (relpath(url, relative_filepath), name))
        else:
            crumbs.append(u"%s" % name)
    return u" > ".join(crumbs)


def _convert_tree(html_tempdir, html2_tempdir, contents_map, css_filename, toc_filename, debug):
    """Convert every .txt/.html file below html_tempdir into html2_tempdir."""
    for html_dirname, _dirnames, filenames in os.walk(html_tempdir, topdown=True):
        if debug:
            filenames = filenames[:30]
        html2_dirname = os.path.join(html2_tempdir, html_dirname[1 + len(html_tempdir):])
        print("%s => %s" % (html_dirname, html2_dirname))
        if not os.path.exists(html2_dirname):
            os.mkdir(html2_dirname)

        for filename in filenames:
            if not filename.lower().endswith(_CONVERTED_EXTENSIONS):
                continue
            html_filepath = os.path.join(html_dirname, filename)
            html2_filepath = os.path.join(html2_dirname, filename)
            relative_filepath = html_filepath[1 + len(html_tempdir):].replace(os.sep, "/")
            # One "../" is needed for each directory below the root.
            depth = relative_filepath.count("/")
            print(" %s (%d)" % (html_filepath, depth))
            root_path = "../" * depth

            with io.open(html_filepath, encoding="cp1252") as infile:
                content = munged_text(infile.read())
            special_processing = SPECIAL_PROCESSING.get(relative_filepath)
            if special_processing:
                content = special_processing(content)

            if filename.lower().endswith(".txt"):
                # Text files are written out without the page template.
                html = content
            else:
                heading = _H1_RE.search(content)
                title = heading.group(1) if heading else filename
                navigation = NAVIGATION_HTML % {
                    "root_path": root_path,
                    "toc_filename": toc_filename,
                    "breadcrumbs": _breadcrumbs(contents_map.get(relative_filepath, []), relative_filepath),
                }
                html = _render_page(title, content, root_path, css_filename, navigation)

            with io.open(html2_filepath, "w", encoding="utf8") as outfile:
                outfile.write(html)


def _generate(chm_filepath, html_tempdir, html2_tempdir, css_filename, toc_filename, debug):
    """Decompile the .chm and write the index, contents and content pages."""
    print("Decompiling .chm...")
    if not os.path.exists(html_tempdir):
        os.mkdir(html_tempdir)
    # An argument list, not a command string: the paths may hold spaces.
    subprocess.call(["hh.exe", "-decompile", html_tempdir, chm_filepath])

    print("Writing index.html...")
    with open(os.path.join(html2_tempdir, "index.html"), "w") as outfile:
        outfile.write(_render_page("PyWin32 Documentation", INDEX_CONTENT, "", css_filename))

    print("Generating contents...")
    contents_html, contents_map = _build_contents(os.path.join(html_tempdir, "pywin32.hhc"))

    print("Writing table of contents...")
    with open(os.path.join(html2_tempdir, toc_filename), "w") as outfile:
        outfile.write(_render_page("PyWin32 Documentation", contents_html, "", css_filename))

    _convert_tree(html_tempdir, html2_tempdir, contents_map, css_filename, toc_filename, debug)


def _write_changes_page(html2_tempdir, changes_filename, css_filename):
    """Add new .html files to svn and write a page listing what has changed."""
    import pysvn

    print("Finding changes...")
    UNCHANGED = [pysvn.wc_status_kind.normal, pysvn.wc_status_kind.ignored]
    EXCLUDE_FROM_CHANGES = ["changes.html", "convert_to_html.py", "pywin32.chm"]
    svn = pysvn.Client()
    unversioned = [
        i.path for i in svn.status(".")
        if i.path.endswith(".html") and i.text_status == pysvn.wc_status_kind.unversioned
    ]
    if unversioned:
        svn.add(unversioned)

    # groupby only groups adjacent items, hence the sort on the same key.
    by_status = operator.attrgetter("text_status")
    changes = sorted((i for i in svn.status(".") if i.text_status not in UNCHANGED), key=by_status)
    content = ["<h1>PyWin32 Documentation Changes</h1>"]
    for status, items in itertools.groupby(changes, by_status):
        content.append("<h2>%s</h2>" % status)
        content.append("<ul>")
        for item in items:
            if os.path.basename(item.path).lower() in EXCLUDE_FROM_CHANGES:
                continue
            content.append('<li><a href="%s">%s</a></li>' % (item.path, os.path.splitext(item.path)[0]))
        content.append("</ul>")

    print("Writing changes...")
    with open(os.path.join(html2_tempdir, changes_filename), "w") as outfile:
        outfile.write(_render_page("PyWin32 Documentation Changes", "\n".join(content), "", css_filename))


def main(args=()):
    """Convert PyWin32.chm into a tree of standalone HTML pages.

    args -- iterable of option names, each of which must be in ARGS:
      nogenerate -- skip decompiling the .chm and generating the pages
      debug -- convert no more than 30 files from each directory
      nosvn -- skip the svn step which produces the changes page

    Raises RuntimeError if any argument is not recognised.

    NOTE(review): the listing this was recovered from had lost its
    indentation. "nogenerate" is assumed to skip the whole generation
    phase and not just the decompile step -- confirm against the
    original script.
    """
    args = set(args)
    if not args <= ARGS:
        raise RuntimeError(
            "Arguments %s not recognised; should only be %s"
            % (", ".join(sorted(args.difference(ARGS))), ", ".join(sorted(ARGS)))
        )

    chm_filepath = "./PyWin32.chm"  ## os.path.join (sys.prefix, "lib", "site-packages", "PyWin32.chm")
    html_tempdir = os.path.join(tempfile.gettempdir(), "pywin32-docs-htmlhelp")
    html2_tempdir = "."
    css_filename = "pywin32.css"
    toc_filename = "contents.html"
    changes_filename = "changes.html"

    if "nogenerate" not in args:
        _generate(chm_filepath, html_tempdir, html2_tempdir, css_filename, toc_filename, "debug" in args)

    if "nosvn" not in args:
        _write_changes_page(html2_tempdir, changes_filename, css_filename)
# Run the conversion only when executed as a script; the command-line
# words are passed straight through as the option names.
if __name__ == '__main__':
    main(sys.argv[1:])
# Sign up for free to join this conversation on GitHub.
# Already have an account? Sign in to comment