Skip to content

Instantly share code, notes, and snippets.

@coffeeaddict
Created March 8, 2013 08:32
Show Gist options
  • Save coffeeaddict/5115012 to your computer and use it in GitHub Desktop.
Save coffeeaddict/5115012 to your computer and use it in GitHub Desktop.
HTML 2 Markdown using a SAX Parser. (WIP)
# html2markdown.rb
require 'nokogiri'
module Html2Markdown
# SAX document handler that turns HTML parse events into Markdown text.
#
# Every supported tag is served by a pair of optional public hook methods:
# +handle_<tag>+ runs when the tag opens and +end_<tag>+ when it closes.
# A tag that has no +end_<tag>+ hook re-runs +handle_<tag>+ on close, which
# is how symmetric markers such as ** and ` end up on both sides of the text.
# Tags without any hook are ignored (their text is reported on stderr).
class HtmlDocument < Nokogiri::XML::SAX::Document
  # Presentational tags handled as their semantic equivalents.
  MAP = { "b" => "strong", "i" => "em" }.freeze

  # Elements whose direct text content is copied into the output.
  # Names are listed after MAP has been applied, so "b"/"i" are not needed.
  TEXT_ELEMENTS = %w[p li h1 h2 h3 h4 h5 h6 strong em code].freeze

  # Indentation emitted per level of list nesting. Four spaces nest under
  # both "* " and "1. " markers.
  LIST_INDENT = "    "

  def initialize
    @list = nil           # type of the innermost open list: :ul, :ol or nil
    @list_depth = 0       # number of currently open lists
    @ol_count = 0         # last number issued in the innermost ordered list
    @outer_lists = []     # saved [@list, @ol_count] of the enclosing lists
    @element = nil        # innermost open element (after MAP)
    @open_elements = []   # stack of open elements, innermost last
    @capture_buffer = +"" # unary plus: mutable even under frozen literals
    super()
  end

  # Returns the Markdown produced so far. A copy is returned so callers
  # cannot alter the internal buffer.
  def captured
    @capture_buffer.dup
  end

  # SAX callback: text content. Text is kept only when the innermost open
  # element is one of TEXT_ELEMENTS; any other non-blank text is reported
  # on stderr and dropped. Whitespace-only text between tags is dropped
  # silently so formatted HTML does not flood stderr.
  def characters(string)
    if TEXT_ELEMENTS.include?(@element)
      @capture_buffer << string
    elsif !string.strip.empty?
      $stderr.puts "Would have added #{string}, but do not know #{@element}"
    end
  end

  # SAX callback: an element opens. Records it as the current element and
  # runs its +handle_<tag>+ hook, passing +attributes+ only to hooks that
  # declare a required parameter.
  def start_element(name, attributes = [])
    name = canonical_name(name)
    @open_elements.push(name)
    @element = name

    hook = hook_for("handle", name)
    return unless hook

    if method(hook).arity > 0
      send(hook, attributes)
    else
      send(hook)
    end
  end

  # SAX callback: an element closes. Runs +end_<tag>+, or falls back to
  # +handle_<tag>+ for symmetric markers, then makes the parent element
  # current again so the text that follows is attributed correctly.
  def end_element(name)
    name = canonical_name(name)
    hook = hook_for("end", name) || symmetric_hook_for(name)
    send(hook) if hook
    close_element(name)
  end

  # Paragraphs (and line breaks) emit nothing when they open.
  def handle_p
  end
  alias_method :handle_br, :handle_p

  # A paragraph is terminated by a blank line.
  def end_p
    @capture_buffer << "\n\n"
  end

  # Two trailing spaces before the newline make a Markdown hard line break;
  # a bare newline would render as a space.
  def end_br
    @capture_buffer << "  \n"
  end

  # Emitted on both the opening and the closing tag (no end_strong hook).
  def handle_strong
    @capture_buffer << "**"
  end

  # Emitted on both the opening and the closing tag (no end_em hook).
  def handle_em
    @capture_buffer << "*"
  end

  # ATX headers: "# " .. "###### " on open, newline on close.
  1.upto(6) do |level|
    prefix = "#{'#' * level} "

    define_method(:"handle_h#{level}") do
      @capture_buffer << prefix
    end

    define_method(:"end_h#{level}") do
      @capture_buffer << "\n"
    end
  end

  # Emitted on both the opening and the closing tag (no end_code hook).
  def handle_code
    @capture_buffer << "`"
  end

  def handle_ul
    open_list(:ul)
  end

  # Closes the innermost list (shared by <ul> and <ol>) and restores the
  # type and numbering of the list that encloses it.
  def end_ul
    return if @outer_lists.empty? # stray closing tag; nothing to restore

    @list, @ol_count = @outer_lists.pop
    @list_depth -= 1
    # A blank line after the outermost list keeps following text out of
    # the last item.
    @capture_buffer << "\n" if @list_depth.zero?
  end
  alias_method :end_ol, :end_ul

  def handle_ol
    open_list(:ol)
  end

  # Starts a list item: indentation for the nesting level, then "* " for
  # bullets or "<n>. " for ordered lists. An <li> outside any list is
  # rendered as an unindented bullet.
  def handle_li
    depth = [@list_depth - 1, 0].max
    mark = @list == :ol ? "#{@ol_count += 1}. " : "* "
    @capture_buffer << (LIST_INDENT * depth) << mark
  end

  # Ends the item's line unless a nested list or paragraph already did.
  def end_li
    @capture_buffer << "\n" unless @capture_buffer.end_with?("\n")
  end

  private

  # Lower-cases a tag name without mutating the parser's string and maps
  # presentational tags to their semantic equivalent.
  def canonical_name(name)
    tag = name.downcase
    MAP.fetch(tag, tag)
  end

  # Returns the public hook method for +prefix+ and +name+, or nil.
  # Names that are SAX callbacks themselves (e.g. end_element for an
  # <element> tag) are never treated as hooks.
  def hook_for(prefix, name)
    hook = :"#{prefix}_#{name}"
    return nil if Nokogiri::XML::SAX::Document.method_defined?(hook)

    hook if respond_to?(hook)
  end

  # Returns the +handle_<tag>+ hook to re-run on close, or nil when there
  # is none or when it needs an attributes argument a closing tag lacks.
  def symmetric_hook_for(name)
    hook = hook_for("handle", name)
    hook if hook && method(hook).arity <= 0
  end

  # Pops +name+ (and anything left open inside it) off the element stack.
  def close_element(name)
    index = @open_elements.rindex(name)
    @open_elements.slice!(index..-1) if index
    @element = @open_elements.last
  end

  # Enters a list of the given type, saving the enclosing list's state.
  def open_list(type)
    # A nested list must start on its own line below the parent item.
    if @list_depth > 0 && !@capture_buffer.end_with?("\n")
      @capture_buffer << "\n"
    end
    @outer_lists.push([@list, @ol_count])
    @list = type
    @ol_count = 0
    @list_depth += 1
  end
end
# Converts HTML to Markdown: +object+ is handed unchanged to
# Nokogiri::HTML::SAX::Parser#parse with an HtmlDocument as the event
# handler, and the text that handler captured is returned.
def self.convert(object)
  handler = HtmlDocument.new
  Nokogiri::HTML::SAX::Parser.new(handler).parse(object)
  handler.captured
end
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment