Skip to content

Instantly share code, notes, and snippets.

@dblock
Last active August 29, 2015 14:06
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save dblock/08ddcbf33b57daf14bb9 to your computer and use it in GitHub Desktop.
Save dblock/08ddcbf33b57daf14bb9 to your computer and use it in GitHub Desktop.
Convert HTML to markdown.
# https://github.com/cousine/downmark_it
#
# =Overview
# DownmarkIt is a library to convert HTML to markdown, based on Hpricot[http://github.com/hpricot/hpricot/].
#
# =Motivation
# While working on our company's new CMS, I needed to parse HTML back to markdown and surprisingly there wasn't any solution that could fit our environment, so I decided to make my own and share it :)
#
# =Usage
# Make sure you install Hpricot[http://github.com/hpricot/hpricot/] first, then require the library in your application, if you are using the library in a rails application, just place it in your lib folder, then use this method to convert HTML into markdown.
# markdown = DownmarkIt.to_markdown(html)
#
# =Features
# This library supports variable header tags, horizontal rulers, emphasis, strong, links, images, blockquotes, code, unordered lists (nested) and ordered lists (nested)
#
# =WARNING
# Currently DownmarkIt does not support ul tags inside ol tags or vice versa; maybe in the future I will add it ;)
#
# =License
# This code is licensed under MIT License
require 'hpricot'
# Converts an HTML fragment to Markdown. The HTML is parsed with Hpricot and
# every supported element is swapped, in place, for its Markdown text; the
# passes run in a fixed order (inline elements first, lists and paragraphs
# last), and the serialized document is returned.
#
# Only +to_markdown+ is meant to be called from outside. The +nested_*+
# helpers are private class methods.
module DownmarkIt
  # TODO: Add nested unordered lists inside ordered list and vice versa support

  # Converts +html+ to Markdown.
  #
  # html - String containing HTML. Newlines, carriage returns and tabs are
  #        replaced by single spaces before parsing.
  #
  # Returns the document serialized with +to_s+ after all replacements.
  def self.to_markdown(html)
    raw = Hpricot(html.gsub(/(\n|\r|\t)/, " "))

    # headers
    # NOTE(review): inside a double-quoted string "\d" is a plain "d", so this
    # selector is literally "/<hd>/". The name check below is what restricts
    # the pass to h<digit> elements -- confirm how Hpricot interprets the
    # selector before changing it.
    (raw / "/<h\d>/").each do |header|
      if header.name.match(/^h\d$/)
        header_level = header.name.match(/\d/).to_s.to_i
        header.swap("#{"#" * header_level} #{header.inner_html}\n")
      end
    end

    # horizontal rulers
    (raw / "hr").each do |hruler|
      hruler.swap("\n---\n")
    end

    # emphasis
    %w(em i).each do |tag|
      (raw / tag).each do |em|
        em.swap("_#{em.inner_html}_") if em.name == tag
      end
    end

    # strong
    (raw / "strong").each do |strong|
      strong.swap("**#{strong.inner_html}**") if strong.name == "strong"
    end

    # links (anchors): anchors without content become autolinks (<url>)
    (raw / "a").each do |anchor|
      if anchor.name == "a"
        if anchor.inner_html != ""
          anchor.swap("[#{anchor.inner_html}](#{anchor['href']}#{" \"#{anchor['title']}\"" if anchor['title']})")
        else
          anchor.swap("<#{anchor['href']}>")
        end
      end
    end

    # image
    (raw / "img").each do |image|
      image.swap("![#{image['alt']}](#{image['src']}#{" \"#{image['title']}\"" if image['title']})")
    end

    # blockquote (the HTML tag is spelled "blockquote")
    (raw / "blockquote").each do |quote|
      quote.swap("> #{nested_qoute(quote)}") if quote.name == "blockquote"
    end

    # code
    (raw / "code").each do |code|
      code.swap("``#{code.inner_html}``") if code.name == "code"
    end

    # unordered list
    (raw / "ul").each do |ul|
      if ul.name == "ul"
        (ul / ">li").each do |li|
          if li.name == "li"
            nli = nested_ul(li, 0)
            # " - " in the helper's result means a nested list was already
            # rendered into bullet lines.
            if nli.match(/ - /)
              li_inner = li.inner_text.match(/^\n/) ? "#{li.inner_text.gsub(/^\n/, "")}\n" : "- #{li.inner_text}\n"
              li.swap("#{li_inner}")
            else
              li.swap("- #{nli}\n")
            end
          end
        end
        ul.swap("#{ul.inner_html}")
      end
    end

    # ordered list
    (raw / "ol").each do |ol|
      if ol.name == "ol"
        level = 0 # running item number for this list
        (ol / ">li").each do |li|
          if li.name == "li"
            nli = nested_ol(li, 0)
            # " <n>. " in the helper's result means a nested list was already
            # rendered into numbered lines.
            if nli.match(/ \d+\. /)
              li_inner = li.inner_text.match(/^\n/) ? "#{li.inner_text.gsub(/^\n/, "")}\n" : "#{level += 1 }. #{li.inner_text}\n"
              li.swap("#{li_inner}")
            else
              li.swap("#{level += 1 }. #{nli}\n")
            end
          end
        end
        ol.swap("#{ol.inner_html}")
      end
    end

    # lines
    (raw / "p").each do |p|
      p.swap("\n#{p.inner_text}\n") if p.name == "p"
    end

    # breaks
    (raw / "br").each do |br|
      br.swap(" \n")
    end

    raw.to_s
  end

  # Returns the quoted content of +qoute+: when it contains another
  # blockquote, the result of recursing into that one prefixed with "> ",
  # otherwise the element's inner HTML.
  def self.nested_qoute(qoute)
    if (nqoute = qoute.at("blockquote"))
      nnqoute = nested_qoute(nqoute)
      "> #{nnqoute}"
    else
      qoute.inner_html
    end
  end

  # Returns the rendering of the first <ul> found inside +li+ (one level
  # deeper than +level+), or the item's inner HTML when there is none.
  def self.nested_ul(li, level)
    ul = li.at("ul")
    if ul
      nested_uli(ul, level + 1)
    else
      li.inner_html
    end
  end

  # Swaps each direct <li> child of the list +li+ for a "- " bullet line
  # indented by +level+ spaces, then returns the list's inner HTML.
  def self.nested_uli(li, level)
    nli = li.at("li")
    if nli
      (li / ">li").each do |cnli|
        nnli = nested_ul(cnli, level + 1)
        if nnli.match(/ - /)
          inner_li = cnli.inner_text.match(/^\n/) ? "" : cnli.inner_text
          cnli.swap "\n#{" " * level}- #{inner_li}" unless inner_li == ""
        else
          cnli.swap "\n#{" " * level}- #{nnli}"
        end
      end
      li.inner_html
    else
      li.inner_html
    end
  end

  # Returns the rendering of the first <ol> found inside +li+ (one level
  # deeper than +level+), or the item's inner HTML when there is none.
  def self.nested_ol(li, level)
    ol = li.at("ol")
    if ol
      nested_oli(ol, level + 1)
    else
      li.inner_html
    end
  end

  # Swaps each direct <li> child of the list +li+ for a numbered line
  # indented by +level+ spaces, then returns the list's inner HTML.
  def self.nested_oli(li, level)
    nli = li.at("li")
    if nli
      nlevel = 0 # running item number for this nested list
      (li / ">li").each do |cnli|
        nnli = nested_ol(cnli, level + 1)
        # The dot is escaped so only a real "<n>. " marker counts, matching
        # the check used for top-level ordered lists in to_markdown.
        if nnli.match(/ \d+\. /)
          inner_li = cnli.inner_text.match(/^\n/) ? "" : cnli.inner_text
          cnli.swap "\n#{" " * level}#{nlevel += 1 }. #{inner_li}" unless inner_li == ""
        else
          cnli.swap "\n#{" " * level}#{nlevel += 1 }. #{nnli}"
        end
      end
      li.inner_html
    else
      li.inner_html
    end
  end

  # A bare `private` does not apply to `def self.` methods, so the helpers
  # are hidden explicitly.
  private_class_method :nested_qoute, :nested_ul, :nested_uli, :nested_ol, :nested_oli
end
# String helpers for detecting HTML and converting it to Markdown.
module MarkdownStringExtension
  # True when the string contains a "<" that is followed, on the same line,
  # by a ">"; false otherwise.
  def contains_html?
    (self =~ /\<.*\>/) ? true : false
  end

  # Returns the string itself when it holds no HTML. Otherwise the string is
  # passed through MarkdownUtils.to_html and the result is handed to
  # DownmarkIt.to_markdown.
  def html_to_markdown
    return self unless contains_html?

    html = MarkdownUtils.to_html(self)
    DownmarkIt.to_markdown(html)
  end
end
# Mix the HTML/Markdown helpers into every String.
String.class_eval { include MarkdownStringExtension }
# Renders Markdown with RDiscount and cleans the result with Sanitize.
module MarkdownUtils
  class << self
    # Renders +s+ to HTML and cleans it with Sanitize's RELAXED config.
    # Returns "" when +s+ is nil or false.
    def to_html(s)
      return "" unless s

      rendered = RDiscount.new(s).to_html.encode("UTF-8", undef: :replace)
      cleaned = Sanitize.clean(rendered, Sanitize::Config::RELAXED)
      # The strip is required: Sanitize was inconsistent across machines
      # about whether it left a trailing newline. -ALF, 6/12/13
      cleaned.strip.html_safe
    end

    # Renders +s+ to HTML and cleans it with Sanitize's default config,
    # then strips surrounding whitespace. Returns "" when +s+ is nil or false.
    def to_text(s)
      return "" unless s

      rendered = RDiscount.new(s).to_html.encode("UTF-8", undef: :replace)
      Sanitize.clean(rendered).strip
    end
  end
end
# Extend Sanitize's RELAXED config: also allow <hr> and <font>, and permit
# the style, size and color attributes on <font>.
Sanitize::Config::RELAXED[:elements] << "hr" << "font"
Sanitize::Config::RELAXED[:attributes]['font'] = %w(style size color)
# https://github.com/rgrove/sanitize/wiki/Transformer%3A-Remove-empty-elements
# Sanitize transformer: unlinks an <a>, <b>, <em>, <i>, <p> or <strong>
# element unless at least one of its children is a non-text node or a text
# node with non-whitespace content.
Sanitize::Transformers::REMOVE_EMPTY_TAGS = lambda do |env|
  element = env[:node]
  tag = env[:node_name]
  next unless element.elem? && %w(a b em i p strong).include?(tag)

  has_content = element.children.any? do |child|
    !child.text? || child.content.strip.length > 0
  end
  element.unlink unless has_content
end
# https://github.com/rgrove/sanitize#example-transformer-to-whitelist-youtube-video-embeds
# Sanitize transformer that handles iframe elements separately: the iframe is
# cleaned down to a fixed set of attributes and then whitelisted.
# NOTE(review): unlike the linked example, nothing here inspects the iframe's
# src, so every iframe element is accepted -- confirm this is intended.
Sanitize::Transformers::ISOLATE_EXTERNAL_IFRAME = lambda do |env|
  iframe = env[:node]

  # Skip nodes that are already whitelisted, are not elements, or are not
  # iframes.
  next if env[:is_whitelisted]
  next unless iframe.element?
  next unless env[:node_name] == 'iframe'

  # Run the node through a dedicated Sanitize step so that only the iframe
  # itself and the attributes listed here are kept.
  allowed_attributes = %w(allowfullscreen frameborder height src width)
  Sanitize.clean_node!(
    iframe,
    elements: %w(iframe),
    attributes: { 'iframe' => allowed_attributes }
  )

  # Tell Sanitize to whitelist the cleaned node.
  { node_whitelist: [iframe] }
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment