Skip to content

Instantly share code, notes, and snippets.

@premist
Created July 7, 2013 06:53
Show Gist options
  • Star 3 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save premist/5942593 to your computer and use it in GitHub Desktop.
Save premist/5942593 to your computer and use it in GitHub Desktop.
WordPress to Jekyll
# =Overview
# DownmarkIt is a library to convert HTML to markdown, based on Hpricot[http://github.com/hpricot/hpricot/].
#
# =Motivation
# While working on our company's new CMS, I needed to parse HTML back to markdown, and surprisingly there wasn't any solution that could fit our environment, so I decided to make my own and share it :)
#
# =Usage
# Make sure you install Hpricot[http://github.com/hpricot/hpricot/] first, then require the library in your application. If you are using the library in a Rails application, just place it in your lib folder. Then use this method to convert HTML into markdown:
# markdown = DownmarkIt.to_markdown(html)
#
# =Features
# This library supports variable header tags, horizontal rules, emphasis, strong, links, images, blockquotes, code, unordered lists (nested) and ordered lists (nested).
#
# =WARNING
# Currently DownmarkIt does not support ul tags inside ol tags or vice versa; maybe in the future I will add it ;)
#
# =License
# This code is licensed under MIT License
module DownmarkIt
  # TODO: Add nested unordered lists inside ordered list and vice versa support

  # Converts an HTML string to markdown.
  #
  # The HTML is parsed with Hpricot and every supported element type is
  # replaced, one type after another, by its markdown text using the node's
  # +swap+ method. The order of the passes matters: later passes read the
  # +inner_html+ / +inner_text+ produced by earlier ones.
  #
  # html:: the HTML source to convert.
  #
  # Returns the converted document as a String (+raw.to_s+). Line breaks
  # emitted by the converter are "\r\n".
  def self.to_markdown(html)
    # raw = Hpricot(html.gsub(/(\n|\r|\t)/, ""))
    # raw = Hpricot(html.gsub(/(\r\n)/, "\r\n\r\n"))
    raw = Hpricot(html)
    # headers: "<hN>text</hN>" becomes N '#' characters followed by the text
    # NOTE(review): inside a double-quoted string "\d" is not a regexp escape;
    # confirm that this selector really matches h1..h6 in Hpricot. The name
    # check below filters whatever the search returns.
    (raw/"/<h\d>/").each do |header|
      if(header.name.match(/^h\d$/))
        header_level = header.name.match(/\d/).to_s.to_i
        header.swap("#{"#" * header_level} #{header.inner_html}\r\n")
      end
    end
    # horizontal rules
    (raw/"hr").each do |hruler|
      hruler.swap("\r\n---\r\n")
    end
    # emphasis
    (raw/"em").each do |em|
      if(em.name == "em")
        em.swap("_#{em.inner_html}_")
      end
    end
    # strong
    (raw/"strong").each do |strong|
      if(strong.name == "strong")
        strong.swap("**#{strong.inner_html}**")
      end
    end
    # links (anchors): "[text](href "title")", or "<href>" when the anchor
    # has no content; the title part is only emitted when a title is present
    (raw/"a").each do |anchor|
      if(anchor.name=="a")
        if(anchor.inner_html != "")
          anchor.swap("[#{anchor.inner_html}](#{anchor['href']}#{" \"#{anchor['title']}\"" if anchor['title']})")
        else
          anchor.swap("<#{anchor['href']}>")
        end
      end
    end
    # image: "![alt](src "title")", title only when present
    (raw/"img").each do |image|
      image.swap("![#{image['alt']}](#{image['src']}#{" \"#{image['title']}\"" if image['title']})")
    end
    # blockquote: one "> " per nesting level (see nested_qoute)
    (raw/"blockquote").each do |qoute|
      if qoute.name == "blockquote"
        qoute.swap("> #{nested_qoute(qoute)}")
      end
    end
    # code
    (raw/"code").each do |code|
      if code.name == "code"
        code.swap("``#{code.inner_html}``")
      end
    end
    # unordered list
    (raw/"ul").each do |ul|
      if ul.name == "ul"
        (ul/">li").each do |li|
          if li.name == "li"
            # nested_ul rewrites the items of a nested <ul> (if any) as
            # indented "- " lines and returns the resulting HTML.
            nli = nested_ul(li, 0)
            if (nli.match(/ - /))
              # " - " is taken as the sign that nested items were rendered;
              # the item is then emitted from its text. When that text starts
              # with a line break no "- " marker is added for the item itself.
              li_inner = (li.inner_text.match(/^\r\n/)) ? ("#{li.inner_text.gsub(/^\r\n/, "")}\r\n") : ("- #{li.inner_text}\r\n")
              li.swap("#{li_inner}")
            else
              li.swap("- #{nli}\r\n")
            end
          end
        end
        # drop the <ul> wrapper, keeping the rewritten items
        ul.swap("#{ul.inner_html}")
      end
    end
    # ordered list
    (raw/"ol").each do |ol|
      if ol.name == "ol"
        # running item number for this list; incremented as items are emitted
        level = 0
        (ol/">li").each do |li|
          if li.name == "li"
            nli = nested_ol(li, 0)
            if (nli.match(/ \d+\. /))
              # " N. " is taken as the sign that nested items were rendered
              li_inner = (li.inner_text.match(/^\r\n/)) ? ("#{li.inner_text.gsub(/^\r\n/, "")}\r\n") : ("#{level+=1}. #{li.inner_text}\r\n")
              li.swap("#{li_inner}")
            else
              li.swap("#{level+=1}. #{nli}\r\n")
            end
          end
        end
        # drop the <ol> wrapper, keeping the rewritten items
        ol.swap("#{ol.inner_html}")
      end
    end
    # lines: each paragraph becomes its text surrounded by line breaks
    (raw/"p").each do |p|
      if p.name == "p"
        p.swap("\r\n#{p.inner_text}\r\n")
      end
    end
    return raw.to_s
  end

  # NOTE(review): a bare `private` does not apply to methods defined with
  # `def self.`, so the helpers below remain callable from outside the module.
  # Left unchanged so that the module's existing interface is not altered.
  private

  # Returns the content of +qoute+ prefixed with one "> " for every
  # blockquote nested inside it (found with +at+); with no nested
  # blockquote the element's inner HTML is returned as-is.
  def self.nested_qoute(qoute)
    nqoute = qoute.at("blockquote")
    unless(nqoute.nil?)
      nnqoute = nested_qoute(nqoute)
      "> #{nnqoute}"
    else
      qoute.inner_html
    end
  end

  # Renders the <ul> nested inside list item +li+, if there is one, through
  # nested_uli at depth level + 1 and returns its result; otherwise returns
  # the item's inner HTML.
  def self.nested_ul(li, level)
    ul = li.at("ul")
    unless(ul.nil?)
      nested_uli(ul, level + 1)
    else
      li.inner_html
    end
  end

  # Rewrites the direct <li> children of the list node +li+ (despite its
  # name, callers pass the <ul> element) as "\r\n", +level+ spaces and
  # "- item", then returns the node's inner HTML.
  def self.nested_uli(li, level)
    nli = li.at("li")
    unless(nli.nil?)
      (li/">li").each do |cnli|
        nnli = nested_ul(cnli, level + 1)
        if (nnli.match(/ - /))
          # deeper items were rendered: emit this item from its text, and
          # skip the swap entirely when that text starts with a line break
          inner_li = (cnli.inner_text.match(/^\r\n/)) ? ("") : (cnli.inner_text)
          cnli.swap "\r\n#{" " * level}- #{inner_li}" unless inner_li == ""
        else
          cnli.swap "\r\n#{" " * level}- #{nnli}"
        end
      end
      li.inner_html
    else
      li.inner_html
    end
  end

  # Renders the <ol> nested inside list item +li+, if there is one, through
  # nested_oli at depth level + 1 and returns its result; otherwise returns
  # the item's inner HTML.
  def self.nested_ol(li, level)
    ol = li.at("ol")
    unless(ol.nil?)
      nested_oli(ol, level + 1)
    else
      li.inner_html
    end
  end

  # Rewrites the direct <li> children of the list node +li+ (callers pass the
  # <ol> element) as "\r\n", +level+ spaces and "N. item", numbering from 1,
  # then returns the node's inner HTML.
  def self.nested_oli(li, level)
    nli = li.at("li")
    unless(nli.nil?)
      nlevel = 0
      (li/">li").each do |cnli|
        nnli = nested_ol(cnli, level + 1)
        # The dot is escaped so that only a rendered "N. " marker counts,
        # matching the check in to_markdown; an unescaped dot also matched
        # plain text such as " 10 " and discarded the item's markup.
        if (nnli.match(/ \d+\. /))
          inner_li = (cnli.inner_text.match(/^\r\n/)) ? ("") : (cnli.inner_text)
          cnli.swap "\r\n#{" " * level}#{nlevel+=1}. #{inner_li}" unless inner_li == ""
        else
          cnli.swap "\r\n#{" " * level}#{nlevel+=1}. #{nnli}"
        end
      end
      li.inner_html
    else
      li.inner_html
    end
  end
end
# coding: utf-8
# WXR Importer by Minku Lee <premist@me.com>
# Licensed under MIT License
require 'rubygems'
require 'hpricot'
require 'fileutils'
require 'json'
require 'time'
load 'importer/downmark_it.rb'
module WP
  # Imports the posts of a WordPress WXR export.
  #
  # Every <item> under <channel> is read with Hpricot; unless its status is
  # "draft" or its title matches "비밀글", its content is converted with
  # DownmarkIt.to_markdown and stored through Post (title, content, url).
  # The number of imported posts is printed at the end.
  #
  # filename:: path of the WXR file to read (defaults to "wordpress.xml").
  #
  # NOTE(review): date, published, type, tags and metas are read from each
  # item but never used afterwards in this method.
  def self.process(filename = "wordpress.xml")
    import_count = 0
    doc = Hpricot::XML(File.read(filename))

    (doc/:channel/:item).each do |item|
      title = item.at(:title).inner_text.strip

      # Fall back to a slug built from the title when the export has none.
      permalink_title = item.at('wp:post_name').inner_text
      permalink_title = title.downcase.split.join('-') if permalink_title == ""

      date = Time.parse(item.at('wp:post_date').inner_text)
      status = item.at('wp:status').inner_text
      published = (status == "publish")
      type = item.at("wp:post_type").inner_text

      category_names = (item/:category).map { |category| category.inner_text }
      tags = category_names.reject { |name| name == 'Uncategorized.' }.uniq

      metas = {}
      item.search("wp:postmeta").each do |meta|
        meta_key = meta.at('wp:meta_key').inner_text
        meta_value = meta.at('wp:meta_value').inner_text
        metas[meta_key] = meta_value
      end

      # Drafts and posts whose title matches "비밀글" are not imported.
      next if status == "draft" || title.match("비밀글")

      # The markdown conversion is stored, not the raw 'content:encoded' HTML.
      markdowned = DownmarkIt.to_markdown(item.at('content:encoded').inner_text)
      @post = Post.new do |entry|
        entry.title = title
        entry.content = markdowned
        entry.url = permalink_title
      end
      @post.save!
      import_count += 1
    end

    puts "Imported #{import_count} post(s)"
  end
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment