Skip to content

Instantly share code, notes, and snippets.

@brandonmwest
Created July 2, 2014 17:25
Show Gist options
  • Star 2 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save brandonmwest/9a4a7f3eb3cd474f666f to your computer and use it in GitHub Desktop.
Save brandonmwest/9a4a7f3eb3cd474f666f to your computer and use it in GitHub Desktop.
convert jekyll html with custom liquid tags and codeblocks to well-structured markdown
require 'rubygems'
require 'json'
require 'nokogiri'
require 'nokogiri-pretty'
require 'iconv'
require 'pandoc-ruby'
require 'fileutils'
#convert HTML to markdown
#
# One-shot migration script. For every Jekyll HTML page under the source tree it:
#   1. converts the page to markdown with pandoc,
#   2. restores the original front matter and the original {% codeblock %} bodies,
#   3. puts line breaks around the custom liquid tags,
#   4. pretty-prints JSON/XML samples and HTML tables,
#   5. writes a .md file next to the page and DELETES the original .html file.
#
# Usage: ruby <this script> [source_dir]
# source_dir defaults to DEFAULT_SOURCE_DIR when no argument is given.

# Tree that is converted when no directory is passed on the command line.
DEFAULT_SOURCE_DIR = "/Users/brandonwest/SendGrid/docs/source"

# Any path containing one of these fragments is left untouched.
SKIPPED_PATH_FRAGMENTS = %w[
  _layouts _includes _assets index.html search.html
  code_workshop.html api_workshop.html error.html
].freeze

FRONT_MATTER_RE   = /(\s?---\s?)(.*?)(\s?---\s?)/m
CODEBLOCK_RE      = /{%\s?codeblock .*?\s+%}.*?{\%\s?endcodeblock\s?%}/m
JSON_CODEBLOCK_RE = /({%\s?codeblock lang:json\s?%})(.*?)({\%\s?endcodeblock\s?%})/m
JSON_RESPONSE_RE  = /(\s?{%\s?response json\s?%}\s?)(.*?)({\%\s?endresponse\s?%}\s?)/m
XML_CODEBLOCK_RE  = /(\s?{%\s?codeblock lang:xml\s?%})(.*?)({\%\s?endcodeblock\s?%})/m
XML_RESPONSE_RE   = /(\s?{%\s?response xml\s?%}\s?)(.*?)({\%\s?endresponse\s?%}\s?)/m
TABLE_RE          = /(<table.*?>)(.*?)(<\/table>)/m

# [pattern, formatter] pairs, applied in order. Each formatter receives the
# first capture group of its pattern (nil when the pattern has none) and
# returns the re-spaced tag.
# Every rule is applied exactly once: the script used to apply the two
# parameter rules a second time, which prepended an extra "\n " to each tag.
TAG_SPACING_RULES = [
  [/{%\s?anchor\s?(.*?)\s?%}\s?/,      lambda { |arg| "\n{% anchor #{arg} %}\n" }],
  [/{%\s?endanchor\s?%}\s?/,           lambda { |_| "\n{% endanchor %}\n" }],
  [/{%\s?info\s?%}\s?/,                lambda { |_| "\n{% info %}\n" }],
  [/{%\s?endinfo\s?%}\s?/,             lambda { |_| "\n{% endinfo %}\n" }],
  [/{%\s?warning\s?%}\s?/,             lambda { |_| "\n{% warning %}\n" }],
  [/{%\s?endwarning\s?%}\s?/,          lambda { |_| "\n{% endwarning %}\n" }],
  [/{%\s?apiexample\s?(.*?)\s?%}\s?/,  lambda { |arg| "\n{% apiexample #{arg} %}" }],
  [/{%\s?endapiexample\s?%}\s?/,       lambda { |_| "{% endapiexample %}\n" }],
  [/{%\s?requestblock\s?%}\s?/,        lambda { |_| "\n{% requestblock %}\n" }],
  [/{%\s?endrequestblock\s?%}\s?/,     lambda { |_| "\n{% endrequestblock %}\n" }],
  [/{%\s?parameter\s(.*?)\s?%}\s?/,    lambda { |arg| "\n {% parameter #{arg} %}" }],
  [/{%\s?endparameter\s?%}\s?/,        lambda { |_| "\n {% endparameter %}\n" }],
  [/{%\s?requesturl\s?(.*?)\s?%}\s?/,  lambda { |arg| "\n {% requesturl #{arg} %}" }],
  [/{%\s?endrequesturl\s?%}\s?/,       lambda { |_| "\n {% endrequesturl %}\n" }],
  [/{%\s?requestdata\s?(.*?)\s?%}\s?/, lambda { |arg| "\n {% requestdata #{arg} %}" }],
  [/{%\s?endrequestdata\s?%}\s?/,      lambda { |_| "\n {% endrequestdata %}\n" }]
].freeze

# Removed from every pretty-printed table: the XML declaration plus a handful
# of attributes.
TABLE_NOISE_PATTERNS = [
  /<\?xml version="1\.0" encoding="ISO-8859-1"\?>\s*/,
  /\s*class="odd"\s*/,
  /\s*class="even"\s*/,
  /\s*class="header"\s*/,
  /\s*align="left"\s*/,
  /\s*markdown="1"\s*/
].freeze

# True when +path+ contains one of SKIPPED_PATH_FRAGMENTS and must not be converted.
def skip_file?(path)
  SKIPPED_PATH_FRAGMENTS.any? { |fragment| path.include?(fragment) }
end

# Path of the markdown file written for +html_file+. Only the trailing ".html"
# extension is replaced, so a directory whose name contains ".html" is safe.
def markdown_path_for(html_file)
  html_file.sub(/\.html\z/, '.md')
end

# Replaces the first front-matter-looking span of +contents+ with the front
# matter taken from the original +html+. Returns +contents+ unchanged when
# +html+ has no front matter (this used to raise NoMethodError).
def restore_front_matter(contents, html)
  front_matter = html.match(FRONT_MATTER_RE)
  return contents unless front_matter

  # Block form: the front matter is inserted literally, so a backslash
  # sequence such as \1 inside it is not expanded as a back-reference.
  contents.sub(FRONT_MATTER_RE) { "---\n" + front_matter[2] + "\n---\n" }
end

# Swaps each {% codeblock %}...{% endcodeblock %} span in the converted
# +contents+ for the span at the same position in the original +html+, to
# preserve the original formatting. A span with no counterpart in +html+ is
# left as converted (this used to raise NoMethodError).
def restore_codeblocks(contents, html)
  originals = html.scan(CODEBLOCK_RE)
  contents.gsub(CODEBLOCK_RE) do |converted|
    original = originals.shift
    next converted unless original

    replacement = "\n\n" + original + "\n\n"
    # NOTE(review): as written this replaces "&" with "&", i.e. it changes
    # nothing. The intended replacement may have been lost when the script
    # was published - confirm before relying on it.
    replacement.gsub!(/&(?!amp)/,"&")
    replacement
  end
end

# Applies every TAG_SPACING_RULES entry to +text+, in order, and returns the result.
def normalize_tag_spacing(text)
  TAG_SPACING_RULES.reduce(text) do |result, (pattern, formatter)|
    result.gsub(pattern) { formatter.call(Regexp.last_match(1)) }
  end
end

# Returns +raw+ re-serialised with JSON.pretty_generate. When +raw+ cannot be
# parsed, the problem is reported on stdout and +raw+ is returned unchanged.
def format_json(raw, html_file)
  JSON.pretty_generate(JSON.parse(raw))
rescue StandardError
  puts "\ninvalid JSON or non-JSON javascript block in #{html_file}: #{raw}\n)"
  puts raw
  raw
end

# Pretty-prints the body of every lang:json codeblock and every json response
# block in +text+. The closing {% endresponse %} tag now gets the same
# one-space indent whether or not the JSON parsed; the valid-JSON path used to
# omit it.
def pretty_print_json(text, html_file)
  with_codeblocks = text.gsub(JSON_CODEBLOCK_RE) do
    body = format_json(Regexp.last_match(2), html_file)
    "\n{% codeblock lang:json %}" + "\n" + body + "\n" + "{% endcodeblock %}\n"
  end
  with_codeblocks.gsub(JSON_RESPONSE_RE) do
    body = format_json(Regexp.last_match(2), html_file)
    "\n {% response json %}" + "\n" + body + "\n" + " {% endresponse %}\n"
  end
end

# Parses +markup+ with Nokogiri. On failure prints +failure_message+ and
# returns nil so the caller can keep the text it already has.
def parse_xml(markup, failure_message)
  Nokogiri.XML(markup, nil, "UTF-8")
rescue StandardError
  puts failure_message
  nil
end

# Pretty-prints the body of every lang:xml codeblock and every xml response
# block in +text+. A block that fails to parse is kept as it was; it used to
# be replaced with an empty string because a bare `next` yields nil.
def pretty_print_xml(text, html_file)
  with_codeblocks = text.gsub(XML_CODEBLOCK_RE) do |block|
    raw = Regexp.last_match(2)
    xml = parse_xml(raw, "\ninvalid XML block in #{html_file}: #{raw}\n)")
    next block unless xml

    "\n{% codeblock lang:xml %}" + "\n" + xml.human + "\n" + "{% endcodeblock %}\n"
  end
  with_codeblocks.gsub(XML_RESPONSE_RE) do |block|
    raw = Regexp.last_match(2)
    xml = parse_xml(raw, "\ninvalid XML block in #{html_file}: #{raw}\n)")
    next block unless xml

    "\n {% response xml %}" + "\n" + xml.human + "\n" + " {% endresponse %}\n"
  end
end

# Returns +markup+ with every TABLE_NOISE_PATTERNS match removed.
def strip_table_noise(markup)
  TABLE_NOISE_PATTERNS.reduce(markup) { |result, pattern| result.gsub(pattern, "") }
end

# Re-renders every <table> in +text+ (the tables pandoc mangled) with the
# bootstrap table classes and pretty-printed markup. A table that fails to
# parse is kept as it was.
def pretty_print_tables(text, html_file)
  text.gsub(TABLE_RE) do |table|
    # String#delete always returns a String. The previous gsub! returned nil
    # for a table without newlines, which made the concatenation below raise
    # and the whole table disappear from the output.
    inner = Regexp.last_match(2).delete("\n")
    table_html = '<table class="table table-bordered table-striped">' + inner + '</table>'
    xml = parse_xml(table_html, "\ninvalid HTML block in #{html_file}:\n)")
    next table unless xml

    strip_table_noise(Iconv.conv('UTF-8', 'iso8859-1', xml.human))
  end
end

# Runs the whole pipeline for one page: +html+ is the page source, +html_file+
# is only used in log messages. Returns the finished markdown.
def convert_html_to_markdown(html, html_file)
  #Convert to markdown!
  contents = PandocRuby.html(html).convert({:f => :html, :to => "markdown_mmd-pipe_tables" }, 'no-wrap', 'parse-raw', 'atx-headers')
  contents = restore_front_matter(contents, html)
  contents = restore_codeblocks(contents, html)
  contents = normalize_tag_spacing(contents)
  contents = pretty_print_json(contents, html_file)
  contents = pretty_print_xml(contents, html_file)
  contents = pretty_print_tables(contents, html_file)
  # Drop remaining markdown="1" attributes and unescape "\_".
  contents.gsub(/\s*markdown="1"\s*/,"").gsub(/\\_/,"_")
end

source_dir = ARGV.first || DEFAULT_SOURCE_DIR
html_files = File.join(source_dir, "**", "*.html")

Dir.glob(html_files) do |html_file|
  next if skip_file?(html_file)

  puts "Converting #{html_file}"
  # Block forms close the handle even when reading or writing raises.
  html = File.open(html_file, "r:UTF-8") { |file| file.read }
  contents = convert_html_to_markdown(html, html_file)

  output_path = markdown_path_for(html_file)
  FileUtils.mkdir_p(File.dirname(output_path))
  FileUtils.rm_f(output_path)
  File.open(output_path, "w:UTF-8") { |file| file.write(contents) }

  ##CAREFUL! The original HTML page is removed once its markdown is written.
  File.delete(html_file)
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment