Skip to content

Instantly share code, notes, and snippets.

@jcoglan jcoglan/html2md.rb
Last active Aug 29, 2015

Embed
What would you like to do?
require 'nokogiri'
# Converts an HTML fragment into Markdown with reference-style links.
#
# The node tree is walked depth-first and every node is dispatched to a
# `visit_<node name>` method. Block-level handlers (headings, paragraphs, list
# items) collect inline text in @block, wrap it to the configured width and
# append the result to @buffer. Links and images are collected in @links and
# written out as numbered reference definitions at the end of the output.
#
#   Html2Md.new('<p>Hello <em>world</em></p>').to_markdown
class Html2Md
  # Target of a link or image, emitted as a `[n]: href (title)` definition.
  Link = Struct.new(:href, :title)

  # One level of block nesting. `head` prefixes the first rendered line of the
  # block, `body` prefixes every later line; `opened` records whether the head
  # has already been written.
  Format = Struct.new(:name, :head, :body, :opened)

  # Default maximum line width used when wrapping text.
  WIDTH = 80

  # markup - an HTML String, or an already-parsed node (anything responding to
  #          `children`), which is used as-is.
  # width  - maximum line width for wrapped text (defaults to WIDTH).
  def initialize(markup, width: WIDTH)
    @markup = markup
    @doc = markup.respond_to?(:children) ? markup : Nokogiri::HTML.fragment(markup)
    @width = width
  end

  # Renders the document and returns the Markdown String, followed by one
  # reference definition per collected link/image.
  def to_markdown
    @buffer = ''.dup
    @stack = []
    @links = []
    @block = nil
    @pre = false
    @code = false
    @list_index = nil

    traverse(@doc)

    @links.each_with_index do |link, i|
      @buffer << "[#{i + 1}]: #{link.href}"
      @buffer << " (#{link.title})" if link.title
      @buffer << "\n"
    end
    @buffer
  end

  # Visits every child of the given node in document order.
  def traverse(doc)
    doc.children.each { |node| visit(node) }
  end

  # Dispatches a node to its `visit_<name>` handler. Nodes without a handler
  # (e.g. <div>, comments) are transparent: their children are visited instead
  # of raising NoMethodError.
  def visit(node)
    handler = "visit_#{node.name}"
    if respond_to?(handler, true)
      __send__(handler, node)
    else
      traverse(node)
    end
  end

  # Defines visit_h1 .. visit_h6: ATX headings prefixed with `level` hashes.
  (1..6).each do |level|
    define_method("visit_h#{level}") do |node|
      prefix = "#{'#' * level} "
      @stack << Format.new("h#{level}", prefix, prefix, false)
      @buffer << format_block(node)
      @stack.pop
      newlines(1)
    end
  end

  def visit_p(node)
    @stack << Format.new('p', '', '', false)
    @buffer << format_block(node)
    @stack.pop
    newlines(1)
  end

  # Preformatted text is copied verbatim (no wrapping, no whitespace
  # collapsing). Content that contains <span> markup is kept as an HTML <pre>
  # element; everything else becomes a fenced code block.
  def visit_pre(node)
    @pre = true
    @block = ''.dup
    traverse(node)

    # to_s: a nested block-level handler may have cleared @block.
    if node.inner_html.include?('<span ')
      @buffer << '<pre>' << @block.to_s << "</pre>\n\n"
    else
      @buffer << "```\n" << @block.to_s << "\n```\n\n"
    end
  ensure
    @block = nil
    @pre = false
  end

  # <code> inside <pre> contributes only its text; elsewhere it is inline code.
  def visit_code(node)
    @pre ? traverse(node) : visit_tt(node)
  end

  def visit_blockquote(node)
    @stack << Format.new('blockquote', '> ', '> ', false)
    traverse(node)
    @stack.pop
    newlines(1)
  end

  def visit_cite(node)
    wrap_inline(node, '<cite>', '</cite>')
  end

  # Embedded content is passed through as raw HTML.
  def visit_iframe(node)
    @buffer << node.to_html
    newlines(2)
  end
  alias :visit_object :visit_iframe
  alias :visit_style :visit_iframe

  # Bulleted list. The ordered-list counter is suspended while inside so that
  # a <ul> nested in an <ol> still gets bullets.
  def visit_ul(node)
    with_list_index(nil) { traverse(node) }
    newlines(1)
  end

  # Numbered list. The enclosing list's counter is restored afterwards so the
  # outer numbering continues after a nested list.
  def visit_ol(node)
    with_list_index(1) { traverse(node) }
    newlines(1)
  end

  def visit_li(node)
    leader = @list_index ? "#{@list_index}. " : '- '
    # Continuation lines are indented by the marker width so they line up with
    # the first line and the wrap width accounts for the marker.
    @stack << Format.new('li', leader, ' ' * leader.length, false)
    @list_index += 1 if @list_index
    @buffer << format_block(node)
    @stack.pop
  end

  # Reference-style link: `[text][n]`.
  def visit_a(node)
    # Outside a text block there is nowhere to put the anchor; just descend.
    return traverse(node) unless @block

    @links << Link.new(node['href'], node['title'])
    # Capture the number now: children (e.g. an <img>) may add more links.
    index = @links.size
    append('[')
    traverse(node)
    append("][#{index}]")
  end

  # Reference-style image: `![alt][n]`.
  def visit_img(node)
    return unless @block

    @links << Link.new(node['src'], node['title'])
    append("![#{node['alt']}][#{@links.size}]")
  end

  # Emphasis uses `*` unless the text itself contains an asterisk.
  def visit_em(node)
    delim = node.text.include?('*') ? '_' : '*'
    wrap_inline(node, delim, delim)
  end

  def visit_strong(node)
    delim = node.text.include?('*') ? '__' : '**'
    wrap_inline(node, delim, delim)
  end

  def visit_i(node)
    wrap_inline(node, '<i>', '</i>')
  end

  def visit_b(node)
    wrap_inline(node, '<b>', '</b>')
  end

  def visit_del(node)
    wrap_inline(node, '<del>', '</del>')
  end

  def visit_ins(node)
    wrap_inline(node, '<ins>', '</ins>')
  end

  # Inline code; double backticks are used when the text contains a backtick.
  def visit_tt(node)
    delim = node.text.include?('`') ? '``' : '`'
    outer_code = @code
    append(delim)
    @code = true
    traverse(node)
    @code = outer_code
    append(delim)
  end

  # Recorded as a newline; format_block turns it into a hard line break.
  def visit_br(node)
    append("\n")
  end

  def visit_span(node)
    wrap_inline(node, %Q{<span class="#{node['class']}">}, '</span>')
  end

  # Text outside any open block is ignored. Inside <pre> text is kept verbatim;
  # elsewhere runs of spaces/newlines collapse to one space and, except inside
  # inline code, a spaced hyphen becomes a double hyphen.
  def visit_text(node)
    return unless @block

    text = node.text
    unless @pre
      text = text.gsub(/[ \n]+/, ' ')
      text = text.gsub(' - ', ' -- ') unless @code
    end
    @block << text
  end

  # Collects the inline content of `node`, wraps it to the available width and
  # returns it with the prefixes of every open Format on @stack. Returns '' if
  # a nested block-level handler already rendered and cleared the text buffer.
  def format_block(node)
    @block = ''.dup
    traverse(node)
    return '' if @block.nil?

    text = "#{@block.strip} "
    @block = nil

    indent = @stack.inject(0) { |sum, format| sum + format.body.length }
    width = [@width - indent, 1].max
    # First alternative: as many characters as fit, ending at whitespace.
    # Second alternative: a single token longer than the width, kept whole on
    # its own line rather than losing its leading characters.
    pattern = /.{1,#{width}}(?: +|\n)|[^ \n]+(?: +|\n)/

    lines = text.scan(pattern).map do |line|
      prefix = @stack.map { |format| format.opened ? format.body : format.head }.join
      @stack.each { |format| format.opened = true }
      prefix + finish_line(line)
    end
    (lines + ['']).join("\n")
  end

  # Appends `n` blank lines, each carrying the body prefix of the open blocks.
  def newlines(n)
    prefix = @stack.map { |format| format.body }.join
    n.times { @buffer << prefix << "\n" }
  end

  private

  # Appends inline text to the open block, if there is one.
  def append(text)
    @block << text if @block
  end

  # Emits `opening`, the node's children, then `closing`.
  def wrap_inline(node, opening, closing)
    append(opening)
    traverse(node)
    append(closing)
  end

  # Runs the block with @list_index set to `index`, restoring the old value.
  def with_list_index(index)
    previous = @list_index
    @list_index = index
    yield
  ensure
    @list_index = previous
  end

  # Strips the separator that ended a wrapped line. A line ended by a <br>
  # newline gets two trailing spaces, Markdown's hard line break.
  def finish_line(line)
    if line.end_with?("\n")
      line.sub(/ *\n\z/, '  ')
    else
      line.sub(/ +\z/, '')
    end
  end
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
You can’t perform that action at this time.