Last active
December 10, 2015 16:38
-
-
Save tomstuart/4462577 to your computer and use it in GitHub Desktop.
Syntax highlighting around inline markup
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env ruby | |
require 'nokogiri' | |
# Syntax-highlights the text inside every <programlisting> or <screen> element
# that carries a `language` attribute, modifying `document` in place.
#
# The element's whole text content is tokenized once; each descendant text
# node is then replaced by a fragment of <phrase role="TOKEN_TYPE"> elements.
# A token that straddles inline markup (e.g. <emphasis>) is split across the
# text nodes it covers, so the existing markup is left intact.
#
# document - the Nokogiri XML document to rewrite.
#
# Returns whatever `each` returns for the matched elements.
# Raises RuntimeError if the token stream stops matching the document text.
def highlight(document)
  document.xpath('descendant::*[self::programlisting or self::screen][attribute::language]').each do |element|
    tokens_and_types = tokenize(element.content, element[:language])
    next_token, next_type = tokens_and_types.shift

    element.xpath('descendant::text()').each do |text|
      content = text.content
      highlighted = Nokogiri::XML::DocumentFragment.new(document)

      until content.empty?
        # Text remains but the lexer produced nothing more to pair it with.
        raise "ran out of tokens with #{content.inspect} left to highlight" if next_token.nil?

        pending = next_token
        prefix, next_token, content = remove_common_prefix(next_token, content)

        # An empty prefix is legitimate only for an empty token (those are
        # emitted as empty phrases). If token text is left over and nothing was
        # consumed, the token and the text disagree and no further iteration
        # could make progress, so fail instead of looping forever.
        if prefix.empty? && !next_token.empty?
          raise "token #{pending.inspect} does not match document text #{content.inspect}"
        end

        highlighted.add_child(document.create_element('phrase', prefix, role: next_type))
        # Advance only once the current token has been used up completely.
        next_token, next_type = tokens_and_types.shift if next_token.empty?
      end

      text.replace(highlighted)
    end
  end
end
# Lexes `string` as `language` and returns an array of [token_text, token_type]
# pairs, one per line of raw lexer output.
def tokenize(string, language)
  raw_lines = get_raw_tokens(string, language)
  raw_lines.map do |raw_line|
    parse_raw_token(raw_line)
  end
end
# Runs `pygmentize` with the raw formatter over `string` and returns its
# output as an array of lines.
#
# string   - source text, written to pygmentize's standard input.
# language - lexer name passed to `-l`.
#
# Raises RuntimeError if pygmentize exits unsuccessfully.
def get_raw_tokens(string, language)
  # stripnl=False stops the lexer from dropping leading/trailing newlines, so
  # the token texts line up with the input text from its first character.
  command = ['pygmentize', '-l', language, '-f', 'raw', '-O', 'stripnl=False']

  lines = IO.popen(command, 'r+') do |io|
    io.print(string)
    # Signal end of input so pygmentize can finish and emit its output.
    io.close_write
    io.readlines
  end

  # The block form of IO.popen sets $? to the child's exit status.
  raise "pygmentize failed for language #{language.inspect}" unless $?.success?

  lines
end
# Splits one line of raw lexer output on tabs into a token type and a Python
# repr of the token text, and returns [decoded_token_text, token_type].
def parse_raw_token(string)
  fields = string.chomp.split(/\t/)
  type = fields[0]
  token_repr = fields[1]
  decoded = python_repr_to_ruby_string(token_repr)
  [decoded, type]
end
# Backslash escapes in a Python string repr that stand for one fixed character.
PYTHON_SIMPLE_ESCAPES = {
  '\\\\' => '\\',
  '\\\'' => '\'',
  '\\"' => '"',
  '\\a' => "\a",
  '\\b' => "\b",
  '\\f' => "\f",
  '\\n' => "\n",
  '\\r' => "\r",
  '\\t' => "\t",
  '\\v' => "\v"
}.freeze

# Converts the Python repr of a string into the Ruby string it denotes.
#
# Accepts both the Python 2 unicode form (u'...') and the unprefixed Python 3
# form ('...'), with either single or double quotes: Python switches to double
# quotes when the text contains a single quote.
#
# repr - the repr text, e.g. "u'foo\\n'".
#
# Returns the decoded String.
# Raises RuntimeError if `repr` is not a quoted string literal.
def python_repr_to_ruby_string(repr)
  # /m lets the body span anything; \1 requires the closing quote to match the opening one.
  match = /\Au?(['"])(.*)\1\z/m.match(repr)
  raise "not a Python string repr: #{repr.inspect}" unless match

  # Scanning left to right keeps an escaped backslash from being misread as
  # the start of the following escape (e.g. \\n is a backslash then "n").
  match[2].gsub(/\\(?:[\\'"abfnrtv]|x\h{2}|u\h{4}|U\h{8})/) do |escape|
    PYTHON_SIMPLE_ESCAPES.fetch(escape) do
      # \xNN, \uNNNN and \UNNNNNNNN all carry a hexadecimal code point.
      [escape[2..-1].to_i(16)].pack('U')
    end
  end
end
# Strips the longest shared leading substring from both strings.
#
# Returns [shared_prefix, rest_of_a, rest_of_b].
def remove_common_prefix(a, b)
  shared = common_prefix(a, b)
  consumed = shared.length
  rest_of_a = a.slice(consumed..-1)
  rest_of_b = b.slice(consumed..-1)
  [shared, rest_of_a, rest_of_b]
end
# Returns the longest string that both `a` and `b` start with ("" if none).
#
# Compares the strings character by character in a single pass rather than
# building every prefix of `a` and testing each one against `b`.
def common_prefix(a, b)
  # Only the first a.length characters of b can take part in the prefix.
  window = b.slice(0, a.length)
  # zip pads with nil when b is shorter, which never equals a character.
  shared = a.chars.zip(window.chars).take_while { |x, y| x == y }.length
  a.slice(0, shared)
end
# Demo: parse a small sample document containing a <screen> and a
# <programlisting>, each with a `language` attribute and inline markup.
document = Nokogiri::XML::Document.parse <<eod | |
<book> | |
<chapter> | |
<screen language="irb">>> <userinput>[:a, :b, :c].length</userinput> | |
=> 3</screen> | |
<programlisting language="ruby">class Foo | |
<emphasis role="bold">def foo(bar)</emphasis> | |
foo = b<emphasis>ar + 1</emphasis> | |
end | |
end</programlisting> | |
</chapter> | |
</book> | |
eod | |
# Highlight the parsed document, then print it as XML to standard output.
highlight(document) | |
puts document.to_xml | |
# <?xml version="1.0"?> | |
# <book> | |
# <chapter> | |
# <screen language="irb"><phrase role="Token.Operator"></phrase><phrase role="Token.Generic.Prompt">>> </phrase><userinput><phrase role="Token.Operator">[</phrase><phrase role="Token.Literal.String.Symbol">:a</phrase><phrase role="Token.Punctuation">,</phrase><phrase role="Token.Text"> </phrase><phrase role="Token.Literal.String.Symbol">:b</phrase><phrase role="Token.Punctuation">,</phrase><phrase role="Token.Text"> </phrase><phrase role="Token.Literal.String.Symbol">:c</phrase><phrase role="Token.Operator">]</phrase><phrase role="Token.Operator">.</phrase><phrase role="Token.Name">length</phrase></userinput><phrase role="Token.Text"> | |
# </phrase><phrase role="Token.Generic.Output">=> 3</phrase></screen> | |
# | |
# <programlisting language="ruby"><phrase role="Token.Keyword">class</phrase><phrase role="Token.Text"> </phrase><phrase role="Token.Name.Class">Foo</phrase><phrase role="Token.Text"> | |
# </phrase><emphasis role="bold"><phrase role="Token.Keyword">def</phrase><phrase role="Token.Text"> </phrase><phrase role="Token.Name.Function">foo</phrase><phrase role="Token.Punctuation">(</phrase><phrase role="Token.Name">bar</phrase><phrase role="Token.Punctuation">)</phrase></emphasis><phrase role="Token.Text"> | |
# </phrase><phrase role="Token.Name">foo</phrase><phrase role="Token.Text"> </phrase><phrase role="Token.Operator">=</phrase><phrase role="Token.Text"> </phrase><phrase role="Token.Name">b</phrase><emphasis><phrase role="Token.Name">ar</phrase><phrase role="Token.Text"> </phrase><phrase role="Token.Operator">+</phrase><phrase role="Token.Text"> </phrase><phrase role="Token.Literal.Number.Integer">1</phrase></emphasis><phrase role="Token.Text"> | |
# </phrase><phrase role="Token.Keyword">end</phrase><phrase role="Token.Text"> | |
# </phrase><phrase role="Token.Keyword">end</phrase></programlisting> | |
# </chapter> | |
# </book> |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment