Skip to content

Instantly share code, notes, and snippets.

@mzjn
Created November 20, 2011 15:50
Show Gist options
  • Save mzjn/9ba641b40af20b13fbfa to your computer and use it in GitHub Desktop.
Save mzjn/9ba641b40af20b13fbfa to your computer and use it in GitHub Desktop.
Jython implementation of the DocBook XSLT 2 'highlight' extension for Saxon 9.3 or later
from os.path import exists
from array import array
from java.io import StringReader, File
from javax.xml.transform.sax import SAXSource
from javax.xml.transform.stream import StreamSource
from org.xml.sax import InputSource
from net.sf.saxon.lib import ExtensionFunctionCall, ExtensionFunctionDefinition
from net.sf.saxon.om import StructuredQName
from net.sf.saxon.om import Axis as omAxis
from net.sf.saxon.value import SequenceType
from net.sf.saxon.s9api import Axis, QName, Processor, Serializer, XdmAtomicValue
from pygments import highlight
from pygments.lexers import get_lexer_by_name, guess_lexer
from pygments.formatters import HtmlFormatter, get_formatter_by_name
# Namespace URIs: XHTML and XSL-FO for the generated markup, and the DocBook
# XSLT 2.0 extension namespace in which the 'highlight' function lives.
XHTML_NS = "http://www.w3.org/1999/xhtml"
FO_NS = "http://www.w3.org/1999/XSL/Format"
EXT_NS = "http://docbook.org/extensions/xslt20"
# Qualified name of the extension function: local name "highlight" in EXT_NS.
qName = StructuredQName("", EXT_NS, "highlight")
# Element names searched for in the formatter output (XHTML <pre> for HTML
# output, FO <wrapper> for XSL-FO output).
h_pre = QName("", XHTML_NS, "pre")
f_wrapper = QName("", FO_NS, "wrapper")
# File the Pygments style definitions are written to; the same name is passed
# to the stylesheet as the "docbook.css" parameter.
cssfile = "highlight.css"
class Pygmenter(ExtensionFunctionDefinition):
    """Definition of the 'highlight' extension function for Saxon.

    Describes the function's name, arity and types, and hands out a
    HighlightCall object that performs the actual work.
    """

    def getFunctionQName(self):
        """Return the qualified name of the function (module-level qName)."""
        return qName

    def getMinimumNumberOfArguments(self):
        """Return the smallest accepted argument count: the code to highlight."""
        return 1

    def getMaximumNumberOfArguments(self):
        """Return the largest accepted argument count: code plus language."""
        return 2

    def getArgumentTypes(self):
        """Return the declared argument types as a Java SequenceType array.

        NOTE(review): a single entry is declared although up to two
        arguments are accepted; presumably Saxon applies the last entry to
        the remaining arguments -- confirm against the Saxon API docs.
        """
        declared = [SequenceType.ATOMIC_SEQUENCE]
        return array(SequenceType, declared)

    def getResultType(self, suppliedArgumentTypes):
        """Return the result type: a sequence of nodes, whatever was supplied."""
        return SequenceType.NODE_SEQUENCE

    def makeCallExpression(self):
        """Return a new HighlightCall to evaluate one call of the function."""
        return HighlightCall()
class HighlightCall(ExtensionFunctionCall):
    """Evaluates one call of the 'highlight' extension function."""

    def _text(self, item):
        """Return the string value of an atomic item as a unicode object.

        With Saxon 9.3 the value is already a unicode object; with Saxon 9.4
        it can be a net.sf.saxon.tree.util.FastStringBuffer, which makes
        Pygments' lexer.py fail with
        AttributeError: 'FastStringBuffer' object has no attribute 'decode'.
        Anything that is not unicode is therefore converted with toString().
        A missing item (None, i.e. an empty argument sequence) yields "".
        """
        if item is None:
            return ""
        value = item.getPrimitiveStringValue()
        if not isinstance(value, unicode):
            value = value.toString()
        return value

    def call(self, arguments, context):
        """Highlight the supplied code and return the resulting nodes.

        arguments[0] supplies the code; the optional arguments[1] supplies
        the language name ("" lets the pygmenter guess). The module-level
        flag 'fo' selects XSL-FO output instead of XHTML.

        Returns an iterator over the children of the first fo:wrapper
        (FO) or XHTML pre (HTML) element of the formatted markup.

        Raises RuntimeError if the formatted markup cannot be parsed or
        does not contain the expected element.
        """
        import sys  # local import: only needed to fetch the active exception

        code = self._text(arguments[0].next())
        language = ""
        if len(arguments) > 1:
            language = self._text(arguments[1].next())

        use_fo = fo
        pygmenter = DocBookPygmenter()
        if use_fo:
            pygmenter.setFormatter("fo")
        result = pygmenter.format(code, language)

        processor = Processor(context.getConfiguration())
        builder = processor.newDocumentBuilder()
        try:
            source = SAXSource(InputSource(StringReader(result)))
            xdmnode = builder.build(source)
        except Exception:
            # I don't ever expect this to happen.
            # sys.exc_info() avoids the version-specific "except X, e" /
            # "except X as e" spellings.
            raise RuntimeError(sys.exc_info()[1])

        if use_fo:
            wanted, label = f_wrapper, "fo:wrapper"
        else:
            wanted, label = h_pre, "XHTML pre"
        matches = xdmnode.axisIterator(Axis.DESCENDANT, wanted)  # s9api.XdmSequenceIterator
        if not matches.hasNext():
            # Previously this fell through to a None dereference.
            raise RuntimeError("highlighted markup contains no %s element" % label)
        elem = matches.next()  # net.sf.saxon.s9api.XdmNode

        un = elem.getUnderlyingNode()  # net.sf.saxon.tree.tiny.TinyElementImpl
        # net.sf.saxon.tree.tiny.SiblingEnumeration
        # (subclass of net.sf.saxon.om.SequenceIterator)
        return un.iterateAxis(omAxis.CHILD)
class DocBookPygmenter(object):
    """Runs Pygments over a code listing and returns XML-parseable markup."""

    def __init__(self, fmt="html"):
        """Remember the output format name ("html" by default, or "fo")."""
        self.formatname = fmt

    def setFormatter(self, name):
        """Switch the output format; "fo" selects the XSL-FO formatter."""
        self.formatname = name

    def format(self, code, language):
        """Return the highlighted form of 'code' as a markup string.

        An empty 'language' lets Pygments guess the lexer from the code;
        otherwise the lexer is looked up by name. For the "fo" format the
        FoFormatter output is returned as is. For every other format the
        HtmlFormatter output is wrapped in a div that declares the XHTML
        namespace, so the caller can find the XHTML pre element in it.
        """
        if language == "":
            lexer = guess_lexer(code)
        else:
            lexer = get_lexer_by_name(language)

        if self.formatname == "fo":
            return highlight(code, lexer, FoFormatter())

        # The wrapper follows the formatter choice, not the literal name
        # "html": any non-FO format uses HtmlFormatter and needs the namespace.
        markup = highlight(code, lexer, HtmlFormatter(linenos=False))
        return "<div xmlns='%s'>%s</div>" % (XHTML_NS, markup)
def make_css():
    """Write the Pygments HTML style definitions to the file named by cssfile."""
    css = HtmlFormatter().get_style_defs()
    f = open(cssfile, "w")
    # try/finally so the handle is closed even if the write fails.
    try:
        f.write(css)
    finally:
        f.close()
def transform(xml, xsl, out):
    """Transform the file 'xml' with the stylesheet 'xsl' into the file 'out'.

    Registers the Pygmenter extension function on a fresh Saxon Processor,
    passes the style sheet name to the stylesheet as the "docbook.css"
    parameter, and runs the transformation. Afterwards the CSS file is
    generated unless the module-level flag 'fo' is set or the file already
    exists.
    """
    processor = Processor(True)
    processor.registerExtensionFunction(Pygmenter())
    compiler = processor.newXsltCompiler()

    input_file = File(xml)
    document = processor.newDocumentBuilder().build(StreamSource(input_file))
    stylesheet_file = File(xsl)
    executable = compiler.compile(StreamSource(stylesheet_file))
    destination = processor.newSerializer(File(out))

    transformer = executable.load()
    transformer.setInitialContextNode(document)
    transformer.setDestination(destination)
    css_param = QName("docbook.css")
    transformer.setParameter(css_param, XdmAtomicValue(cssfile))
    transformer.transform()

    # The style sheet is only needed for HTML output, and only once.
    if fo or exists(cssfile):
        return
    make_css()
if __name__ == '__main__':
    # Script entry point: transform verbatim.xml to HTML or, when 'fo' is
    # set, to XSL-FO. 'fo' is a module-level flag that is also read by
    # HighlightCall.call() and transform().
    from os.path import splitext

    xml = "verbatim.xml"
    xsl = "../xslt/base/html/docbook.xsl"
    # Swap only the extension; str.replace("xml", ...) would also rewrite
    # any other "xml" substring of the file name.
    base = splitext(xml)[0]
    out = base + ".html"
    fo = True
    if fo:
        # Register XSL-FO formatter
        from pygments.formatters import _mapping
        from xslfo import FoFormatter
        _mapping.FORMATTERS[FoFormatter] = ('XSL Formatting Objects',
                                            ('fo', 'FO', 'XSL-FO'), ('*.fo',),
                                            'Format tokens as fo:inline elements.')
        # Switch to the FO stylesheet by replacing the directory segment only.
        xsl = xsl.replace("/html/", "/fo/")
        out = base + ".fo"
    transform(xml, xsl, out)
@mzjn
Copy link
Author

mzjn commented Nov 20, 2011

This is a "pure Jython" implementation of the DocBook XSLT 2 'highlight' extension for the Saxon XSLT processor. The inspiration came from this blog post: http://norman.walsh.name/2011/08/31/xsltPygments, which shows how Pygments can be called from Java via Jython.

For XSL-FO output, the formatter in https://gist.github.com/89fe2c4685f71e07e941 can be used.

@mzjn
Copy link
Author

mzjn commented Dec 22, 2011

Updated with workaround for non-obvious difference between Saxon 9.3 and 9.4.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment