Skip to content

Instantly share code, notes, and snippets.

@petrblaho
Created November 1, 2010 09:00
Show Gist options
  • Save petrblaho/657856 to your computer and use it in GitHub Desktop.
Save petrblaho/657856 to your computer and use it in GitHub Desktop.
require 'cgi'
def html2text html
text = html.
gsub(/( |\n|\s)+/im, ' ').squeeze(' ').strip.
gsub(/<([^\s]+)[^>]*(src|href)=\s*(.?)([^>\s]*)\3[^>]*>\4<\/\1>/i,
'\4')
links = []
linkregex = /<[^>]*(src|href)=\s*(.?)([^>\s]*)\2[^>]*>\s*/i
while linkregex.match(text)
links << $~[3]
text.sub!(linkregex, "[#{links.size}]")
end
text = CGI.unescapeHTML(
text.
gsub(/<(script|style)[^>]*>.*<\/\1>/im, '').
gsub(/<!--.*-->/m, '').
gsub(/<hr(| [^>]*)>/i, "___\n").
gsub(/<li(| [^>]*)>/i, "\n* ").
gsub(/<blockquote(| [^>]*)>/i, '> ').
gsub(/<(br)(| [^>]*)>/i, "\n").
gsub(/<(\/h[\d]+|p)(| [^>]*)>/i, "\n\n").
gsub(/<[^>]*>/, '')
).lstrip.gsub(/\n[ ]+/, "\n") + "\n"
for i in (0...links.size).to_a
text = text + "\n [#{i+1}] <#{CGI.unescapeHTML(links[i])}>" unless
links[i].nil?
end
links = nil
text
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment