## dblock/downmark_it.rb

## downmark_it.rb
# https://github.com/cousine/downmark_it
#
# =Overview
# DownmarkIt is a library to convert HTML to markdown, based on Hpricot[http://github.com/hpricot/hpricot/].
#
# =Motivation
# While working on our company's new CMS, I needed to parse HTML back to markdown and surprisingly there wasn't any solution that could fit our environment, so I decided to make my own and share it :)
#
# =Usage
# Make sure you install Hpricot[http://github.com/hpricot/hpricot/] first, then require the library in your application. If you are using the library in a Rails application, just place it in your lib folder. Then use this method to convert HTML into markdown:
#  markdown = DownmarkIt.to_markdown(html)
#
# =Features
# This library supports variable header tags, horizontal rulers, emphasis, strong, links, images, blockquotes, code, unordered lists (nested) and ordered lists (nested).
#
# =WARNING
# Currently DownmarkIt does not support ul tags inside ol tags or vice versa; maybe in the future I will add it ;)
#
# =License
# This code is licensed under MIT License
require 'hpricot'

module DownmarkIt
  # Converts an HTML string to markdown.
  #
  # The input is parsed with Hpricot and every supported element is replaced
  # in place (via +swap+) by its markdown rendering. The passes run in a fixed
  # order: headers, horizontal rulers, emphasis, strong, links, images,
  # blockquotes, code, unordered lists, ordered lists, paragraphs and breaks.
  #
  # html - the HTML String to convert. Newlines, carriage returns and tabs are
  #        each replaced by a single space before parsing.
  #
  # Returns the rewritten document serialised with +to_s+.
  #
  # TODO: Add nested unordered lists inside ordered list and vice versa support
  def self.to_markdown(html)
    raw = Hpricot(html.gsub(/(\n|\r|\t)/, " "))

    # headers: one "#" per header level. Each tag name is searched for
    # explicitly, the same way every other tag in this method is.
    %w(h1 h2 h3 h4 h5 h6).each do |tag|
      (raw / tag).each do |header|
        next unless header.name == tag
        header_level = tag[/\d/].to_i
        header.swap("#{"#" * header_level} #{header.inner_html}\n")
      end
    end

    # horizontal rulers
    (raw / "hr").each do |hruler|
      hruler.swap("\n---\n")
    end

    # emphasis
    %w(em i).each do |tag|
      (raw / tag).each do |em|
        em.swap("_#{em.inner_html}_") if em.name == tag
      end
    end

    # strong
    (raw / "strong").each do |strong|
      strong.swap("**#{strong.inner_html}**") if strong.name == "strong"
    end

    # links (anchors): an anchor with content becomes [content](href "title"),
    # an empty anchor becomes an automatic link <href>.
    (raw / "a").each do |anchor|
      if anchor.name == "a"
        if anchor.inner_html != ""
          anchor.swap("[#{anchor.inner_html}](#{anchor['href']}#{" \"#{anchor['title']}\"" if anchor['title']})")
        else
          anchor.swap("<#{anchor['href']}>")
        end
      end
    end

    # image
    (raw / "img").each do |image|
      image.swap("![#{image['alt']}](#{image['src']}#{" \"#{image['title']}\"" if image['title']})")
    end

    # blockquote (the HTML tag name is "blockquote")
    (raw / "blockquote").each do |qoute|
      qoute.swap("> #{nested_qoute(qoute)}") if qoute.name == "blockquote"
    end

    # code
    (raw / "code").each do |code|
      code.swap("``#{code.inner_html}``") if code.name == "code"
    end

    # unordered list
    (raw / "ul").each do |ul|
      if ul.name == "ul"
        (ul / ">li").each do |li|
          if li.name == "li"
            nli = nested_ul(li, 0)
            # " - " in the helper's result is taken to mean nested items were
            # already rendered inside this <li>.
            if nli.match(/ - /)
              li_inner = li.inner_text.match(/^\n/) ? "#{li.inner_text.gsub(/^\n/, "")}\n" : "- #{li.inner_text}\n"
              li.swap("#{li_inner}")
            else
              li.swap("- #{nli}\n")
            end
          end
        end
        # Drop the <ul> wrapper, keeping the rewritten items.
        ul.swap("#{ul.inner_html}")
      end
    end

    # ordered list
    (raw / "ol").each do |ol|
      if ol.name == "ol"
        level = 0 # running item number within this <ol>
        (ol / ">li").each do |li|
          if li.name == "li"
            nli = nested_ol(li, 0)
            # " N. " in the helper's result is taken to mean nested items were
            # already rendered inside this <li>.
            if nli.match(/ \d+\. /)
              li_inner = li.inner_text.match(/^\n/) ? "#{li.inner_text.gsub(/^\n/, "")}\n" : "#{level += 1 }. #{li.inner_text}\n"
              li.swap("#{li_inner}")
            else
              li.swap("#{level += 1 }. #{nli}\n")
            end
          end
        end
        # Drop the <ol> wrapper, keeping the rewritten items.
        ol.swap("#{ol.inner_html}")
      end
    end

    # lines
    (raw / "p").each do |p|
      p.swap("\n#{p.inner_text}\n") if p.name == "p"
    end

    # breaks (two trailing spaces are markdown's hard line break)
    (raw / "br").each do |br|
      br.swap("  \n")
    end

    raw.to_s
  end

  # NOTE: `private` has no effect on methods defined with `def self.`, so the
  # helpers below remain publicly callable. It is kept so the callable surface
  # of the module is unchanged.
  private

  # Renders the content of a blockquote element.
  #
  # qoute - a node responding to +at+ and +inner_html+.
  #
  # Returns "> " plus the rendering of the first nested blockquote when one
  # exists, otherwise the node's inner_html.
  def self.nested_qoute(qoute)
    if (nqoute = qoute.at("blockquote"))
      nnqoute = nested_qoute(nqoute)
      "> #{nnqoute}"
    else
      qoute.inner_html
    end
  end

  # Renders the first <ul> found inside +li+ one level deeper, or returns
  # +li.inner_html+ when there is none.
  def self.nested_ul(li, level)
    ul = li.at("ul")
    if ul
      nested_uli(ul, level + 1)
    else
      li.inner_html
    end
  end

  # Rewrites the direct <li> children of a list element into indented "- "
  # lines and returns the element's resulting inner_html.
  #
  # li    - despite the name, the <ul> element handed over by nested_ul.
  # level - number of spaces of indentation placed before each item.
  def self.nested_uli(li, level)
    nli = li.at("li")
    if nli
      (li / ">li").each do |cnli|
        nnli = nested_ul(cnli, level + 1)
        if nnli.match(/ - /)
          inner_li = cnli.inner_text.match(/^\n/) ? "" : cnli.inner_text
          cnli.swap "\n#{" " * level}- #{inner_li}" unless inner_li == ""
        else
          cnli.swap "\n#{" " * level}- #{nnli}"
        end
      end
      li.inner_html
    else
      li.inner_html
    end
  end

  # Renders the first <ol> found inside +li+ one level deeper, or returns
  # +li.inner_html+ when there is none.
  def self.nested_ol(li, level)
    ol = li.at("ol")
    if ol
      nested_oli(ol, level + 1)
    else
      li.inner_html
    end
  end

  # Rewrites the direct <li> children of a list element into indented,
  # numbered lines and returns the element's resulting inner_html.
  #
  # li    - despite the name, the <ol> element handed over by nested_ol.
  # level - number of spaces of indentation placed before each item.
  def self.nested_oli(li, level)
    nli = li.at("li")
    if nli
      nlevel = 0 # running item number within this nested list
      (li / ">li").each do |cnli|
        nnli = nested_ol(cnli, level + 1)
        # The dot is escaped so only a literal "N. " marker counts, matching
        # the check used for top-level ordered lists.
        if nnli.match(/ \d+\. /)
          inner_li = cnli.inner_text.match(/^\n/) ? "" : cnli.inner_text
          cnli.swap "\n#{" " * level}#{nlevel += 1 }. #{inner_li}" unless inner_li == ""
        else
          cnli.swap "\n#{" " * level}#{nlevel += 1 }. #{nnli}"
        end
      end
      li.inner_html
    else
      li.inner_html
    end
  end
end

## markdown_string_extension.rb
module MarkdownStringExtension
  # True when the receiver contains a "<" that is followed, later on the same
  # line, by a ">".
  def contains_html?
    !(self =~ /\<.*\>/).nil?
  end

  # Returns the receiver itself when #contains_html? is false. Otherwise the
  # receiver is rendered through MarkdownUtils.to_html and the result is
  # converted back to markdown by DownmarkIt.to_markdown.
  def html_to_markdown
    return self unless contains_html?

    DownmarkIt.to_markdown(MarkdownUtils.to_html(self))
  end
end

# Make the helpers available on every String.
class String
  include MarkdownStringExtension
end

## markdown_utils.rb
module MarkdownUtils
  # Renders markdown +s+ to sanitized HTML.
  #
  # A falsy +s+ yields a plain empty string. Otherwise the text is rendered by
  # RDiscount, re-encoded to UTF-8 (undefined characters replaced), cleaned
  # with Sanitize's RELAXED config, stripped and marked +html_safe+.
  def self.to_html(s)
    if s
      rendered = RDiscount.new(s).to_html.encode("UTF-8", undef: :replace)
      # The strip is necessary because Sanitize was inconsistent across
      # machines about whether it left a trailing newline. -ALF, 6/12/13
      Sanitize.clean(rendered, Sanitize::Config::RELAXED).strip.html_safe
    else
      ""
    end
  end

  # Renders markdown +s+ and cleans the result with Sanitize's default
  # configuration, returning the stripped string.
  #
  # A falsy +s+ yields an empty string.
  def self.to_text(s)
    if s
      rendered = RDiscount.new(s).to_html.encode("UTF-8", undef: :replace)
      Sanitize.clean(rendered).strip
    else
      ""
    end
  end
end

## sanitize.rb
# Extend Sanitize's RELAXED config in place: additionally allow <hr> and
# <font>, and permit the style, size and color attributes on <font>.
Sanitize::Config::RELAXED[:elements].push("hr", "font")
Sanitize::Config::RELAXED[:attributes]['font'] = %w(style size color)

# https://github.com/rgrove/sanitize/wiki/Transformer%3A-Remove-empty-elements
# Sanitize transformer that unlinks a, b, em, i, p and strong elements whose
# children are all blank text nodes (or that have no children at all).
Sanitize::Transformers::REMOVE_EMPTY_TAGS = lambda do |env|
  node = env[:node]

  # Only element nodes with one of the listed names are candidates.
  return unless node.elem?
  return unless %w(a b em i p strong).include?(env[:node_name])

  # A node is kept when it has any non-text child, or any text child that is
  # not empty once stripped.
  keep = node.children.any? { |child| !child.text? || child.content.strip.length > 0 }

  node.unlink unless keep
end

# https://github.com/rgrove/sanitize#example-transformer-to-whitelist-youtube-video-embeds
# Sanitize transformer that whitelists <iframe> elements after cleaning them
# down to a fixed attribute set.
#
# NOTE(review): nothing here inspects the iframe's src, so every iframe is
# whitelisted regardless of host - confirm that this is intended.
Sanitize::Transformers::ISOLATE_EXTERNAL_IFRAME = lambda do |env|
  node = env[:node]

  # Nothing to do for nodes that are already whitelisted, for non-element
  # nodes, or for elements other than iframe.
  return if env[:is_whitelisted]
  return unless node.element?
  return unless env[:node_name] == 'iframe'

  # Clean the iframe itself so that only the iframe element and the listed
  # attributes survive.
  Sanitize.clean_node!(
    node,
    elements: ['iframe'],
    attributes: { 'iframe' => ['allowfullscreen', 'frameborder', 'height', 'src', 'width'] }
  )

  # Tell Sanitize to whitelist the cleaned node.
  { node_whitelist: [node] }
end
	# https://github.com/cousine/downmark_it
	#
	# =Overview
	# DownmarkIt is a library to convert HTML to markdown, based on Hpricot[http://github.com/hpricot/hpricot/].
	#
	# =Motivation
	# While working on our company's new CMS, I needed to parse HTML back to markdown and surprisingly there wasn't any solution that could fit our environment, so I decided to make my own and share it :)
	#
	# =Usage
	# Make sure you install Hpricot[http://github.com/hpricot/hpricot/] first, then require the library in your application. If you are using the library in a Rails application, just place it in your lib folder. Then use this method to convert HTML into markdown:
	# markdown = DownmarkIt.to_markdown(html)
	#
	# =Features
	# This library supports variable header tags, horizontal rulers, emphasis, strong, links, images, blockquotes, code, unordered lists (nested) and ordered lists (nested).
	#
	# =WARNING
	# Currently DownmarkIt does not support ul tags inside ol tags or vice versa; maybe in the future I will add it ;)
	#
	# =License
	# This code is licensed under MIT License
	require 'hpricot'

	module DownmarkIt
	  # Converts an HTML string to markdown.
	  #
	  # The input is parsed with Hpricot and every supported element is replaced
	  # in place (via +swap+) by its markdown rendering. The passes run in a fixed
	  # order: headers, horizontal rulers, emphasis, strong, links, images,
	  # blockquotes, code, unordered lists, ordered lists, paragraphs and breaks.
	  #
	  # html - the HTML String to convert. Newlines, carriage returns and tabs are
	  #        each replaced by a single space before parsing.
	  #
	  # Returns the rewritten document serialised with +to_s+.
	  #
	  # TODO: Add nested unordered lists inside ordered list and vice versa support
	  def self.to_markdown(html)
	    raw = Hpricot(html.gsub(/(\n|\r|\t)/, " "))

	    # headers: one "#" per header level. Each tag name is searched for
	    # explicitly, the same way every other tag in this method is.
	    %w(h1 h2 h3 h4 h5 h6).each do |tag|
	      (raw / tag).each do |header|
	        next unless header.name == tag
	        header_level = tag[/\d/].to_i
	        header.swap("#{"#" * header_level} #{header.inner_html}\n")
	      end
	    end

	    # horizontal rulers
	    (raw / "hr").each do |hruler|
	      hruler.swap("\n---\n")
	    end

	    # emphasis
	    %w(em i).each do |tag|
	      (raw / tag).each do |em|
	        em.swap("_#{em.inner_html}_") if em.name == tag
	      end
	    end

	    # strong
	    (raw / "strong").each do |strong|
	      strong.swap("**#{strong.inner_html}**") if strong.name == "strong"
	    end

	    # links (anchors): an anchor with content becomes [content](href "title"),
	    # an empty anchor becomes an automatic link <href>.
	    (raw / "a").each do |anchor|
	      if anchor.name == "a"
	        if anchor.inner_html != ""
	          anchor.swap("[#{anchor.inner_html}](#{anchor['href']}#{" \"#{anchor['title']}\"" if anchor['title']})")
	        else
	          anchor.swap("<#{anchor['href']}>")
	        end
	      end
	    end

	    # image
	    (raw / "img").each do |image|
	      image.swap("![#{image['alt']}](#{image['src']}#{" \"#{image['title']}\"" if image['title']})")
	    end

	    # blockquote (the HTML tag name is "blockquote")
	    (raw / "blockquote").each do |qoute|
	      qoute.swap("> #{nested_qoute(qoute)}") if qoute.name == "blockquote"
	    end

	    # code
	    (raw / "code").each do |code|
	      code.swap("``#{code.inner_html}``") if code.name == "code"
	    end

	    # unordered list
	    (raw / "ul").each do |ul|
	      if ul.name == "ul"
	        (ul / ">li").each do |li|
	          if li.name == "li"
	            nli = nested_ul(li, 0)
	            # " - " in the helper's result is taken to mean nested items were
	            # already rendered inside this <li>.
	            if nli.match(/ - /)
	              li_inner = li.inner_text.match(/^\n/) ? "#{li.inner_text.gsub(/^\n/, "")}\n" : "- #{li.inner_text}\n"
	              li.swap("#{li_inner}")
	            else
	              li.swap("- #{nli}\n")
	            end
	          end
	        end
	        # Drop the <ul> wrapper, keeping the rewritten items.
	        ul.swap("#{ul.inner_html}")
	      end
	    end

	    # ordered list
	    (raw / "ol").each do |ol|
	      if ol.name == "ol"
	        level = 0 # running item number within this <ol>
	        (ol / ">li").each do |li|
	          if li.name == "li"
	            nli = nested_ol(li, 0)
	            # " N. " in the helper's result is taken to mean nested items were
	            # already rendered inside this <li>.
	            if nli.match(/ \d+\. /)
	              li_inner = li.inner_text.match(/^\n/) ? "#{li.inner_text.gsub(/^\n/, "")}\n" : "#{level += 1 }. #{li.inner_text}\n"
	              li.swap("#{li_inner}")
	            else
	              li.swap("#{level += 1 }. #{nli}\n")
	            end
	          end
	        end
	        # Drop the <ol> wrapper, keeping the rewritten items.
	        ol.swap("#{ol.inner_html}")
	      end
	    end

	    # lines
	    (raw / "p").each do |p|
	      p.swap("\n#{p.inner_text}\n") if p.name == "p"
	    end

	    # breaks (two trailing spaces are markdown's hard line break)
	    (raw / "br").each do |br|
	      br.swap("  \n")
	    end

	    raw.to_s
	  end

	  # NOTE: `private` has no effect on methods defined with `def self.`, so the
	  # helpers below remain publicly callable. It is kept so the callable surface
	  # of the module is unchanged.
	  private

	  # Renders the content of a blockquote element.
	  #
	  # qoute - a node responding to +at+ and +inner_html+.
	  #
	  # Returns "> " plus the rendering of the first nested blockquote when one
	  # exists, otherwise the node's inner_html.
	  def self.nested_qoute(qoute)
	    if (nqoute = qoute.at("blockquote"))
	      nnqoute = nested_qoute(nqoute)
	      "> #{nnqoute}"
	    else
	      qoute.inner_html
	    end
	  end

	  # Renders the first <ul> found inside +li+ one level deeper, or returns
	  # +li.inner_html+ when there is none.
	  def self.nested_ul(li, level)
	    ul = li.at("ul")
	    if ul
	      nested_uli(ul, level + 1)
	    else
	      li.inner_html
	    end
	  end

	  # Rewrites the direct <li> children of a list element into indented "- "
	  # lines and returns the element's resulting inner_html.
	  #
	  # li    - despite the name, the <ul> element handed over by nested_ul.
	  # level - number of spaces of indentation placed before each item.
	  def self.nested_uli(li, level)
	    nli = li.at("li")
	    if nli
	      (li / ">li").each do |cnli|
	        nnli = nested_ul(cnli, level + 1)
	        if nnli.match(/ - /)
	          inner_li = cnli.inner_text.match(/^\n/) ? "" : cnli.inner_text
	          cnli.swap "\n#{" " * level}- #{inner_li}" unless inner_li == ""
	        else
	          cnli.swap "\n#{" " * level}- #{nnli}"
	        end
	      end
	      li.inner_html
	    else
	      li.inner_html
	    end
	  end

	  # Renders the first <ol> found inside +li+ one level deeper, or returns
	  # +li.inner_html+ when there is none.
	  def self.nested_ol(li, level)
	    ol = li.at("ol")
	    if ol
	      nested_oli(ol, level + 1)
	    else
	      li.inner_html
	    end
	  end

	  # Rewrites the direct <li> children of a list element into indented,
	  # numbered lines and returns the element's resulting inner_html.
	  #
	  # li    - despite the name, the <ol> element handed over by nested_ol.
	  # level - number of spaces of indentation placed before each item.
	  def self.nested_oli(li, level)
	    nli = li.at("li")
	    if nli
	      nlevel = 0 # running item number within this nested list
	      (li / ">li").each do |cnli|
	        nnli = nested_ol(cnli, level + 1)
	        # The dot is escaped so only a literal "N. " marker counts, matching
	        # the check used for top-level ordered lists.
	        if nnli.match(/ \d+\. /)
	          inner_li = cnli.inner_text.match(/^\n/) ? "" : cnli.inner_text
	          cnli.swap "\n#{" " * level}#{nlevel += 1 }. #{inner_li}" unless inner_li == ""
	        else
	          cnli.swap "\n#{" " * level}#{nlevel += 1 }. #{nnli}"
	        end
	      end
	      li.inner_html
	    else
	      li.inner_html
	    end
	  end
	end
	module MarkdownStringExtension
	  # True when the receiver contains a "<" that is followed, later on the same
	  # line, by a ">".
	  def contains_html?
	    !(self =~ /\<.*\>/).nil?
	  end

	  # Returns the receiver itself when #contains_html? is false. Otherwise the
	  # receiver is rendered through MarkdownUtils.to_html and the result is
	  # converted back to markdown by DownmarkIt.to_markdown.
	  def html_to_markdown
	    return self unless contains_html?

	    DownmarkIt.to_markdown(MarkdownUtils.to_html(self))
	  end
	end

	# Make the helpers available on every String.
	class String
	  include MarkdownStringExtension
	end
	module MarkdownUtils
	  # Renders markdown +s+ to sanitized HTML.
	  #
	  # A falsy +s+ yields a plain empty string. Otherwise the text is rendered by
	  # RDiscount, re-encoded to UTF-8 (undefined characters replaced), cleaned
	  # with Sanitize's RELAXED config, stripped and marked +html_safe+.
	  def self.to_html(s)
	    if s
	      rendered = RDiscount.new(s).to_html.encode("UTF-8", undef: :replace)
	      # The strip is necessary because Sanitize was inconsistent across
	      # machines about whether it left a trailing newline. -ALF, 6/12/13
	      Sanitize.clean(rendered, Sanitize::Config::RELAXED).strip.html_safe
	    else
	      ""
	    end
	  end

	  # Renders markdown +s+ and cleans the result with Sanitize's default
	  # configuration, returning the stripped string.
	  #
	  # A falsy +s+ yields an empty string.
	  def self.to_text(s)
	    if s
	      rendered = RDiscount.new(s).to_html.encode("UTF-8", undef: :replace)
	      Sanitize.clean(rendered).strip
	    else
	      ""
	    end
	  end
	end
	# Extend Sanitize's RELAXED config in place: additionally allow <hr> and
	# <font>, and permit the style, size and color attributes on <font>.
	Sanitize::Config::RELAXED[:elements].push("hr", "font")
	Sanitize::Config::RELAXED[:attributes]['font'] = %w(style size color)

	# https://github.com/rgrove/sanitize/wiki/Transformer%3A-Remove-empty-elements
	# Sanitize transformer that unlinks a, b, em, i, p and strong elements whose
	# children are all blank text nodes (or that have no children at all).
	Sanitize::Transformers::REMOVE_EMPTY_TAGS = lambda do |env|
	  node = env[:node]

	  # Only element nodes with one of the listed names are candidates.
	  return unless node.elem?
	  return unless %w(a b em i p strong).include?(env[:node_name])

	  # A node is kept when it has any non-text child, or any text child that is
	  # not empty once stripped.
	  keep = node.children.any? { |child| !child.text? || child.content.strip.length > 0 }

	  node.unlink unless keep
	end

	# https://github.com/rgrove/sanitize#example-transformer-to-whitelist-youtube-video-embeds
	# Sanitize transformer that whitelists <iframe> elements after cleaning them
	# down to a fixed attribute set.
	#
	# NOTE(review): nothing here inspects the iframe's src, so every iframe is
	# whitelisted regardless of host - confirm that this is intended.
	Sanitize::Transformers::ISOLATE_EXTERNAL_IFRAME = lambda do |env|
	  node = env[:node]

	  # Nothing to do for nodes that are already whitelisted, for non-element
	  # nodes, or for elements other than iframe.
	  return if env[:is_whitelisted] || !node.element?
	  return unless env[:node_name] == 'iframe'

	  # Clean the iframe itself so that only the iframe element and the listed
	  # attributes survive.
	  Sanitize.clean_node!(
	    node,
	    elements: %w(iframe),
	    attributes: {
	      'iframe' => %w(allowfullscreen frameborder height src width)
	    }
	  )

	  # Tell Sanitize to whitelist the cleaned node.
	  { node_whitelist: [node] }
	end