Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
WordPress to Jekyll
# =Overview
# DownmarkIt is a library to convert HTML to markdown, based on Hpricot[http://github.com/hpricot/hpricot/].
#
# =Motivation
# While working on our company's new CMS, I needed to parse HTML back to markdown and surprisngly there wasn't any solution that could fit our enviroment, so I decided to make my own and share it :)
#
# =Usage
# Make sure you install Hpricot[http://github.com/hpricot/hpricot/] first, then require the library in your application, if you are using the library in a rails application, just place it in your lib folder, then use this method to convert HTML into markdown.
# markdown = DownmarkIt.to_markdown(html)
#
# =Features
# This library supports variable header tags, horizontal rulers, emphasis, strong, links, images, blockquotes, code, unordered lists(nested) and ordered lists(nested)
#
# =WARNING
# Currently DownmarkIt does not support ul tags inside ol tags or vice versa, maybe in the future i will add it ;)
#
# =License
# This code is licensed under MIT License
module DownmarkIt
# TODO: Add nested unordered lists inside ordered list and vice versa support
def self.to_markdown(html)
# raw = Hpricot(html.gsub(/(\n|\r|\t)/, ""))
# raw = Hpricot(html.gsub(/(\r\n)/, "\r\n\r\n"))
raw = Hpricot(html)
# headers
(raw/"/<h\d>/").each do |header|
if(header.name.match(/^h\d$/))
header_level = header.name.match(/\d/).to_s.to_i
header.swap("#{"#" * header_level} #{header.inner_html}\r\n")
end
end
# horizontal rulers
(raw/"hr").each do |hruler|
hruler.swap("\r\n---\r\n")
end
# emphasis
(raw/"em").each do |em|
if(em.name == "em")
em.swap("_#{em.inner_html}_")
end
end
# strong
(raw/"strong").each do |strong|
if(strong.name == "strong")
strong.swap("**#{strong.inner_html}**")
end
end
# links (anchors)
(raw/"a").each do |anchor|
if(anchor.name=="a")
if(anchor.inner_html != "")
anchor.swap("[#{anchor.inner_html}](#{anchor['href']}#{" \"#{anchor['title']}\"" if anchor['title']})")
else
anchor.swap("<#{anchor['href']}>")
end
end
end
# image
(raw/"img").each do |image|
image.swap("![#{image['alt']}](#{image['src']}#{" \"#{image['title']}\"" if image['title']})")
end
# blockquote
(raw/"blockquote").each do |qoute|
if qoute.name == "blockquote"
qoute.swap("> #{nested_qoute(qoute)}")
end
end
# code
(raw/"code").each do |code|
if code.name == "code"
code.swap("``#{code.inner_html}``")
end
end
# unordered list
(raw/"ul").each do |ul|
if ul.name == "ul"
(ul/">li").each do |li|
if li.name == "li"
nli = nested_ul(li, 0)
if (nli.match(/ - /))
li_inner = (li.inner_text.match(/^\r\n/))?("#{li.inner_text.gsub(/^\r\n/, "")}\r\n"):("- #{li.inner_text}\r\n")
li.swap("#{li_inner}")
else
li.swap("- #{nli}\r\n")
end
end
end
ul.swap("#{ul.inner_html}")
end
end
# ordered list
(raw/"ol").each do |ol|
if ol.name == "ol"
level = 0
(ol/">li").each do |li|
if li.name == "li"
nli = nested_ol(li, 0)
if (nli.match(/ \d+\. /))
li_inner = (li.inner_text.match(/^\r\n/))?("#{li.inner_text.gsub(/^\r\n/, "")}\r\n"):("#{level+=1}. #{li.inner_text}\r\n")
li.swap("#{li_inner}")
else
li.swap("#{level+=1}. #{nli}\r\n")
end
end
end
ol.swap("#{ol.inner_html}")
end
end
# lines
(raw/"p").each do |p|
if p.name == "p"
p.swap("\r\n#{p.inner_text}\r\n")
end
end
return raw.to_s
end
private
def self.nested_qoute(qoute)
nqoute = qoute.at("blockquote")
unless(nqoute.nil?)
nnqoute = nested_qoute(nqoute)
"> #{nnqoute}"
else
qoute.inner_html
end
end
def self.nested_ul(li, level)
ul = li.at("ul")
unless(ul.nil?)
nested_uli(ul, level + 1)
else
li.inner_html
end
end
def self.nested_uli(li, level)
nli = li.at("li")
unless(nli.nil?)
(li/">li").each do |cnli|
nnli = nested_ul(cnli, level + 1)
if (nnli.match(/ - /))
inner_li = (cnli.inner_text.match(/^\r\n/))?(""):(cnli.inner_text)
cnli.swap "\r\n#{" " * level}- #{inner_li}" unless inner_li == ""
else
cnli.swap "\r\n#{" " * level}- #{nnli}"
end
end
li.inner_html
else
li.inner_html
end
end
def self.nested_ol(li, level)
ol = li.at("ol")
unless(ol.nil?)
nested_oli(ol, level + 1)
else
li.inner_html
end
end
def self.nested_oli(li, level)
nli = li.at("li")
unless(nli.nil?)
nlevel = 0
(li/">li").each do |cnli|
nnli = nested_ol(cnli, level + 1)
if (nnli.match(/ \d+. /))
inner_li = (cnli.inner_text.match(/^\r\n/))?(""):(cnli.inner_text)
cnli.swap "\r\n#{" " * level}#{nlevel+=1}. #{inner_li}" unless inner_li == ""
else
cnli.swap "\r\n#{" " * level}#{nlevel+=1}. #{nnli}"
end
end
li.inner_html
else
li.inner_html
end
end
end
# coding: utf-8
# WXR Importer by Minku Lee <premist@me.com>
# Licensed under MIT License
require 'rubygems'
require 'hpricot'
require 'fileutils'
require 'json'
require 'time'
load 'importer/downmark_it.rb'
module WP
def self.process(filename = "wordpress.xml")
import_count = 0
doc = Hpricot::XML(File.read(filename))
(doc/:channel/:item).each do |item|
title = item.at(:title).inner_text.strip
permalink_title = item.at('wp:post_name').inner_text
if permalink_title == ""
permalink_title = title.downcase.split.join('-')
end
date = Time.parse(item.at('wp:post_date').inner_text)
status = item.at('wp:status').inner_text
if status == "publish"
published = true
else
published = false
end
type = item.at("wp:post_type").inner_text
tags = (item/:category).map{|c| c.inner_text}.reject{|c| c == 'Uncategorized.'}.uniq
metas = Hash.new
item.search("wp:postmeta").each do |meta|
key = meta.at('wp:meta_key').inner_text
value = meta.at('wp:meta_value').inner_text
metas[key] = value;
end
unless status == "draft" || title.match("비밀글")
markdowned = DownmarkIt.to_markdown(item.at('content:encoded').inner_text)
@post = Post.new do |post|
post.title = title
# post.content = item.at('content:encoded').inner_text
post.content = markdowned
post.url = permalink_title
end
@post.save!
import_count += 1
end
end
puts "Imported #{import_count} post(s)"
end
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
You can’t perform that action at this time.
You signed in with another tab or window. Reload to refresh your session. You signed out in another tab or window. Reload to refresh your session.