Skip to content

Instantly share code, notes, and snippets.

@DrayChou
Created February 11, 2012 04:41
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save DrayChou/1796343 to your computer and use it in GitHub Desktop.
Save DrayChou/1796343 to your computer and use it in GitHub Desktop.
wordpress blog -> markdown
# coding: utf-8
require 'rubygems'
require 'hpricot'
require 'fileutils'
require 'time'
require 'ya2yaml'
require 'nokogiri'
require 'cgi'
require 'chinese_pinyin'
# =Overview
# DownmarkIt is a library to convert HTML to markdown, based on Hpricot[http://github.com/hpricot/hpricot/].
#
# =Motivation
# While working on our company's new CMS, I needed to parse HTML back to markdown and surprisingly there wasn't any solution that could fit our environment, so I decided to make my own and share it :)
#
# =Usage
# Make sure you install Hpricot[http://github.com/hpricot/hpricot/] first, then require the library in your application, if you are using the library in a rails application, just place it in your lib folder, then use this method to convert HTML into markdown.
# markdown = DownmarkIt.to_markdown(html)
#
# =Features
# This library supports variable header tags, horizontal rules, emphasis, strong, links, images, blockquotes, code, unordered lists (nested) and ordered lists (nested)
#
# =WARNING
# Currently DownmarkIt does not support ul tags inside ol tags or vice versa; maybe in the future I will add it ;)
#
# =License
# This code is licensed under MIT License
# Feature names are matched against file names, so the capitalised spelling
# cannot be found on a case-sensitive filesystem; use the gem's real
# (lowercase) name.
require 'hpricot'
# Converts an HTML string to Markdown by parsing it with Hpricot and swapping
# each supported element for its Markdown text, one element kind per pass.
module DownmarkIt
  # TODO: Add nested unordered lists inside ordered list and vice versa support

  # Convert +html+ to Markdown.
  #
  # html:: String of HTML. Newlines, carriage returns and tabs are removed
  #        before parsing.
  #
  # Returns the rewritten document serialised with +to_s+.
  #
  # Most passes re-check the element name before swapping.
  # NOTE(review): presumably this guards against matches invalidated by an
  # earlier swap -- confirm before removing the checks.
  def self.to_markdown(html)
    raw = Hpricot(html.gsub(/(\n|\r|\t)/, ""))
    # headers
    # NOTE(review): inside a double-quoted string "\d" is just "d", so this
    # selector is "/<hd>/"; confirm it actually matches h1..h6 in Hpricot.
    (raw/"/<h\d>/").each do |header|
      if(header.name.match(/^h\d$/))
        header_level = header.name.match(/\d/).to_s.to_i
        header.swap("#{"#" * header_level} #{header.inner_html}\r\n")
      end
    end
    # horizontal rules
    (raw/"hr").each do |hruler|
      hruler.swap("\r\n---\r\n")
    end
    # emphasis
    (raw/"em").each do |em|
      if(em.name == "em")
        em.swap("_#{em.inner_html}_")
      end
    end
    # strong
    (raw/"strong").each do |strong|
      if(strong.name == "strong")
        strong.swap("**#{strong.inner_html}**")
      end
    end
    # links (anchors): empty anchors become <href> autolinks
    (raw/"a").each do |anchor|
      if(anchor.name=="a")
        if(anchor.inner_html != "")
          anchor.swap("[#{anchor.inner_html}](#{anchor['href']}#{" \"#{anchor['title']}\"" if anchor['title']})")
        else
          anchor.swap("<#{anchor['href']}>")
        end
      end
    end
    # image
    (raw/"img").each do |image|
      image.swap("![#{image['alt']}](#{image['src']}#{" \"#{image['title']}\"" if image['title']})")
    end
    # blockquote (the HTML tag is spelled "blockquote"; the previous
    # "blockqoute" selector never matched anything)
    (raw/"blockquote").each do |qoute|
      if qoute.name == "blockquote"
        qoute.swap("> #{nested_qoute(qoute)}")
      end
    end
    # code
    (raw/"code").each do |code|
      if code.name == "code"
        code.swap("``#{code.inner_html}``")
      end
    end
    # unordered list
    (raw/"ul").each do |ul|
      if ul.name == "ul"
        (ul/">li").each do |li|
          if li.name == "li"
            nli = nested_ul(li, 0)
            # " - " in the rendered item means nested items were produced
            if (nli.match(/ - /))
              li_inner = (li.inner_text.match(/^\r\n/))?("#{li.inner_text.gsub(/^\r\n/, "")}\r\n"):("- #{li.inner_text}\r\n")
              li.swap("#{li_inner}")
            else
              li.swap("- #{nli}\r\n")
            end
          end
        end
        ul.swap("#{ul.inner_html}")
      end
    end
    # ordered list
    (raw/"ol").each do |ol|
      if ol.name == "ol"
        level = 0 # running item number for this list
        (ol/">li").each do |li|
          if li.name == "li"
            nli = nested_ol(li, 0)
            # " N. " in the rendered item means nested items were produced
            if (nli.match(/ \d+\. /))
              li_inner = (li.inner_text.match(/^\r\n/))?("#{li.inner_text.gsub(/^\r\n/, "")}\r\n"):("#{level+=1}. #{li.inner_text}\r\n")
              li.swap("#{li_inner}")
            else
              li.swap("#{level+=1}. #{nli}\r\n")
            end
          end
        end
        ol.swap("#{ol.inner_html}")
      end
    end
    # paragraphs
    (raw/"p").each do |p|
      if p.name == "p"
        p.swap("\r\n#{p.inner_text}\r\n")
      end
    end
    return raw.to_s
  end

  # NOTE: `private` has no effect on methods defined with `def self.`, so the
  # helpers below remain callable from outside the module.
  private

  # Render the content of a blockquote, adding one "> " prefix for each
  # blockquote nested inside it.
  #
  # qoute:: a blockquote element.
  #
  # Returns the innermost blockquote's inner HTML, prefixed once per nesting
  # level found below +qoute+.
  def self.nested_qoute(qoute)
    nqoute = qoute.at("blockquote")
    unless(nqoute.nil?)
      nnqoute = nested_qoute(nqoute)
      "> #{nnqoute}"
    else
      qoute.inner_html
    end
  end

  # Render a list item of an unordered list.
  #
  # li::    the list item element.
  # level:: nesting depth of +li+.
  #
  # Returns the rewritten inner HTML of the first nested <ul> when there is
  # one, otherwise the item's own inner HTML.
  def self.nested_ul(li, level)
    ul = li.at("ul")
    unless(ul.nil?)
      nested_uli(ul, level + 1)
    else
      li.inner_html
    end
  end

  # Rewrite the direct <li> children of a nested unordered list into
  # "- item" lines, each preceded by "\r\n" and +level+ spaces.
  #
  # li::    the nested <ul> element (despite the parameter name).
  # level:: number of spaces to indent with.
  #
  # Returns the list's inner HTML after the items were swapped.
  def self.nested_uli(li, level)
    nli = li.at("li")
    unless(nli.nil?)
      (li/">li").each do |cnli|
        nnli = nested_ul(cnli, level + 1)
        if (nnli.match(/ - /))
          inner_li = (cnli.inner_text.match(/^\r\n/))?(""):(cnli.inner_text)
          cnli.swap "\r\n#{" " * level}- #{inner_li}" unless inner_li == ""
        else
          cnli.swap "\r\n#{" " * level}- #{nnli}"
        end
      end
      li.inner_html
    else
      li.inner_html
    end
  end

  # Render a list item of an ordered list.
  #
  # li::    the list item element.
  # level:: nesting depth of +li+.
  #
  # Returns the rewritten inner HTML of the first nested <ol> when there is
  # one, otherwise the item's own inner HTML.
  def self.nested_ol(li, level)
    ol = li.at("ol")
    unless(ol.nil?)
      nested_oli(ol, level + 1)
    else
      li.inner_html
    end
  end

  # Rewrite the direct <li> children of a nested ordered list into
  # "N. item" lines, each preceded by "\r\n" and +level+ spaces. Numbering
  # restarts at 1 for every nested list.
  #
  # li::    the nested <ol> element (despite the parameter name).
  # level:: number of spaces to indent with.
  #
  # Returns the list's inner HTML after the items were swapped.
  def self.nested_oli(li, level)
    nli = li.at("li")
    unless(nli.nil?)
      nlevel = 0
      (li/">li").each do |cnli|
        nnli = nested_ol(cnli, level + 1)
        # The dot is escaped so only a literal "N. " marker counts, matching
        # the check used for top-level items in to_markdown.
        if (nnli.match(/ \d+\. /))
          inner_li = (cnli.inner_text.match(/^\r\n/))?(""):(cnli.inner_text)
          cnli.swap "\r\n#{" " * level}#{nlevel+=1}. #{inner_li}" unless inner_li == ""
        else
          cnli.swap "\r\n#{" " * level}#{nlevel+=1}. #{nnli}"
        end
      end
      li.inner_html
    else
      li.inner_html
    end
  end
end
# Maps a language name taken from a `brush: <lang>` class attribute to the
# name written after the opening code fence (see pre_to_codeblock). Names that
# are not listed are used unchanged. Frozen so the shared table cannot be
# modified by accident.
LANG_TABLE = {
  'asm' => 'nasm',
  'lisp' => 'cl',
}.freeze
# Replace +node+ with its own children: every child is moved in front of the
# node, then the now-empty node is removed.
def remove_node(node)
  node.children.each do |child|
    node.add_previous_sibling child
  end
  node.remove
end

# Replace +node+ with a <u> element (created in +doc+) that takes over the
# node's children.
def underline_to_u(doc, node)
  u = doc.create_element 'u'
  node.children.each do |child|
    u << child
  end
  node.add_previous_sibling u
  node.remove
end

# Replace +node+ with a text node holding a fenced code block built from the
# node's text. The fence language is read from a `brush: <lang>` entry in the
# node's class attribute and translated through LANG_TABLE; when no language
# is found the fence has none.
def pre_to_codeblock(doc, node)
  lang = node['class'].to_s[/brush: (\w+)/, 1]
  lang = LANG_TABLE.fetch(lang, lang)
  fence = doc.create_text_node "\n\n``` #{lang}\n#{node.text}\n```\n\n"
  node.add_previous_sibling fence
  node.remove
end

# Clean up the HTML of a post body.
#
# html:: String of HTML. It is not modified.
#
# Steps: strip the first "more" marker, unwrap every <p>, turn underlined
# <span>s into <u>, unwrap <span>s that only set a font family, and turn
# <pre class="brush: ..."> elements into fenced code blocks.
#
# Returns the inner HTML of the parsed <body>, with HTML entities unescaped.
def filter_html(html)
  # The marker is spelled "<!--more-->"; the previously matched spelling
  # "<!--more--!>" is still accepted.
  html = html.sub(/<!--more--!?>/, '')
  doc = Nokogiri::HTML html
  doc.css('p').each do |para|
    remove_node para
  end
  doc.css('span').each do |span|
    style = span['style']
    next unless style
    # Underline is checked first: unwrapping a span that carries both styles
    # would move its children away before the <u> conversion could wrap them.
    if style[/underline/]
      underline_to_u(doc, span)
    elsif style[/font-family/]
      remove_node(span)
    end
  end
  doc.css('pre').each do |pre|
    pre_to_codeblock(doc, pre) if pre['class'] and pre['class'][/brush/]
  end
  CGI.unescapeHTML(doc.css('body').inner_html)
end
module Jekyll
  # This importer takes a wordpress.xml file, which can be exported from your
  # wordpress.com blog (/wp-admin/export.php).
  module WordpressDotCom
    # Markup that is turned into {% codeblock %} sections; capture group 1 is
    # the block body. Bodies are matched lazily so that two blocks in one post
    # are not merged together with the text between them.
    CODE_BLOCK_PATTERNS = [
      /<\s*blockquote.*?>(.*?)<\/\s*?blockquote[^>\w]*?>/m,
      /\[\s*cc.*?\](.*?)\[\/cc\]/m,
      /<\s*code.*?>(.*?)<\/\s*?code[^>\w]*?>/m,
      /<\s*pre.*?>(.*?)<\/\s*?pre[^>\w]*?>/m,
      /<\?php(.*?)\?>/m
    ].freeze

    # Import every <item> of the export file as a Markdown file with a YAML
    # header, written to "_<post type>s/<date>-<slug>.md" below the current
    # directory, and print the number of imported items per post type.
    #
    # filename:: path of the WordPress XML export.
    def self.process(filename = "wordpress.xml")
      import_count = Hash.new(0)
      doc = Hpricot::XML(File.read(filename))
      (doc/:channel/:item).each do |item|
        title = item.at(:title).inner_text.strip
        permalink_title = item.at('wp:post_name').inner_text
        # Fallback to "prettified" title if post_name is empty (can happen)
        if permalink_title == ""
          permalink_title = title.downcase.split.join('-')
        end
        date = Time.parse(item.at('wp:post_date').inner_text)
        status = item.at('wp:status').inner_text
        published = (status == "publish")
        type = item.at('wp:post_type').inner_text
        tags = category_names(item, 'post_tag')
        categories = category_names(item, 'category')
        excerpt = item.at('excerpt:encoded').inner_text

        # Post slug: when decoding changes it (percent-encoded text), build
        # the slug from the decoded text with Pinyin.t and drop punctuation.
        # NOTE(review): URI.decode is not available on Ruby 3.0 and later;
        # this script appears to target older Rubies -- confirm before
        # upgrading.
        decoded_title = URI.decode(permalink_title)
        if permalink_title != decoded_title
          permalink_title = Pinyin.t(decoded_title, '-')
          permalink_title = permalink_title.gsub(/[,|?|—|:|!|~|。]/, "")
          puts permalink_title
        end
        name = "#{date.strftime('%Y-%m-%d')}-#{permalink_title}.md"

        # Empty and nil values are left out of the header.
        header = {
          'layout' => type,
          'title' => title,
          'tags' => tags,
          'categories' => categories,
          'status' => status,
          'type' => type,
          'published' => published,
          'excerpt' => excerpt,
          'comments' => true,
        }.delete_if {|k,v| v.nil? || v == ''}

        # <blockquote>, [cc], <code>, <pre> and <?php ?> sections all become
        # {% codeblock %} sections.
        content = convert_code_blocks(item.at('content:encoded').inner_text)

        # convert images to OctopressBlog
        content = content.gsub(/http:\/\/zh-w\.info\/wp-content\/uploads/, '/images/uploads')
        content = content.gsub(/\/wp-content\/uploads/, '/images/uploads')

        FileUtils.mkdir_p "_#{type}s"
        File.open("_#{type}s/#{name}", "w:utf-8") do |f|
          f.puts header.ya2yaml
          f.puts '---'
          f.puts filter_html(content)
        end
        import_count[type] += 1
      end
      import_count.each do |key, value|
        puts "Imported #{value} #{key}s"
      end
    end

    # Wrap every section matched by CODE_BLOCK_PATTERNS in
    # {% codeblock %} ... {% endcodeblock %}.
    #
    # content:: String holding the post body. It is not modified.
    #
    # Each match is replaced with its own captured body. The block form of
    # gsub is used so that backslashes inside the code are copied verbatim
    # instead of being read as backreferences.
    #
    # Returns the converted String.
    def self.convert_code_blocks(content)
      CODE_BLOCK_PATTERNS.inject(content) do |text, pattern|
        text.gsub(pattern) do
          "\n{% codeblock %}\n#{Regexp.last_match(1)}\n{% endcodeblock %}\n"
        end
      end
    end

    # Collect the distinct texts of the item's <category> elements whose
    # "domain" attribute equals +domain+, leaving out 'Uncategorized'.
    def self.category_names(item, domain)
      (item/:category).select { |c| c['domain'] == domain }.
        map { |c| c.inner_text }.
        reject { |text| text == 'Uncategorized' }.
        uniq
    end
  end
end
# Import the export file named by the first command-line argument. When no
# argument is given nothing is passed, so `process` uses its own default file
# name instead of receiving nil.
Jekyll::WordpressDotCom.process(*ARGV.first(1))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment