Skip to content

Instantly share code, notes, and snippets.

@iamdustan
Created July 9, 2014 19:39
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save iamdustan/3de22984fb28a18fffa5 to your computer and use it in GitHub Desktop.
Save iamdustan/3de22984fb28a18fffa5 to your computer and use it in GitHub Desktop.
# Input: a WordPress XML export file.
# Output: a series of Markdown files ready to be included in a Middleman site.
require 'pry'
require 'rubygems'
require 'nokogiri'
require 'upmark'
require 'html2md'
require 'time'
require 'fileutils'
require 'active_support/core_ext/hash/conversions'
# SETTINGS #
# Paths are resolved two directories above the directory the script is run from.
# Dir.pwd is used instead of ENV['PWD'] because the environment variable is
# maintained by the shell and may be missing or stale when no shell is involved.

# Location of the exported WordPress archive.
WORDPRESS_XML_FILE_PATH =
  File.join(Dir.pwd, '..', '..', 'skookumdigitalworks.wordpress.2014-06-13.xml').freeze
# Directory the converted posts are saved to.
OUTPUT_PATH = File.join(Dir.pwd, '..', '..', 'source', 'blog').freeze
# Domain of the original website.
ORIGINAL_DOMAIN = "http://www.skookum.com".freeze
# Converts a WordPress XML export into one Markdown file per published post,
# each with YAML front matter, for use in a Middleman blog.
#
# Reads the export from WORDPRESS_XML_FILE_PATH and writes into OUTPUT_PATH;
# both constants must be defined before the class methods are called.
class Parser
  # Bare gist URL at the start of a line, optionally with a "username/" segment.
  # The id is matched as hex digits because gist ids are not always numeric.
  GIST_URL = %r{^https?://gist\.github\.com/(?<username>[\w-]+/)?/?(?<id>\h+)\b}
  # [gist id=... file=...] shortcode; the quotes around the values are optional.
  # The file value stops at "]" so it cannot swallow text after the shortcode.
  GIST_SHORTCODE = /\[gist id="?(?<id>\h+)"? file="?(?<file>[^"\]]*)"?\]/
  # [gallery link="..."] shortcode at the start of a line (dropped entirely).
  GALLERY_SHORTCODE = /^\[gallery link="[^"]*"\]/
  # Bare Vimeo URL at the start of a line.
  VIMEO_URL = %r{^https?://vimeo\.com/(?<id>\d+)}
  # YouTube URL anywhere in the text: youtube.com/watch?v=ID, youtube.com/ID
  # or the youtu.be/ID short form.
  YOUTUBE_URL = %r{https?://(?:www\.)?(?:youtube\.com/(?:watch\?v=)?|youtu\.be/)(?<id>[\w-]+)}
  # [caption ...]inner markup[/caption] shortcode.
  CAPTION_SHORTCODE = %r{\[caption [^\]]*\](?<inner>[^\[]*)\[/caption\]}

  # Creates OUTPUT_PATH (including missing parents) unless it already exists,
  # and reports the location when the directory had to be created.
  def self.make_output_path
    return if File.directory?(OUTPUT_PATH)

    FileUtils.mkdir_p(OUTPUT_PATH)
    puts "Saving all files in " + OUTPUT_PATH.to_s
  end

  # Parses the WordPress export and writes a Markdown file for every post
  # whose status is 'publish'. Posts without a title, link or date are
  # skipped with a warning on stderr.
  #
  # @return [Array<Hash>] the post hashes found in the export
  def self.xml_to_hash
    # Block form so the file handle is closed even if parsing raises.
    xml = File.open(WORDPRESS_XML_FILE_PATH) { |file| Nokogiri::XML(file) }
    channel = Hash.from_xml(xml.to_s)['rss']['channel']

    # A single <author>/<item> element is returned as a Hash rather than an
    # Array, so both are normalised to lists before iterating.
    authors = as_list(channel['author'])
    as_list(channel['item']).each do |post|
      next if post['status'] != 'publish'

      write_post(post, authors)
    end
  end

  # Turns an arbitrary string into a lowercase, hyphen-separated file name.
  #
  # @param filename [String]
  # @return [String]
  def self.sanitize_filename(filename)
    filename
      .downcase
      .gsub(/[^\w\s_-]+/, '')
      .strip # removing punctuation can expose whitespace at either end
      .gsub(/(^|\b\s)\s+($|\s?\b)/, '\\1\\2')
      .gsub(/\s+/, '-')
      .gsub(/[-]+/, '-')
  end

  # Builds the embed markup used for video players.
  # Left public: the bare `private` that used to precede this method had no
  # effect on `def self.` methods, so it has always been callable from outside.
  #
  # @param src [String] value of the iframe's src attribute
  # @return [String]
  def self.iframe(src)
    "<iframe src=\"#{src}\" width=\"500\" height=\"281\" frameborder=\"0\" allowfullscreen></iframe>"
  end

  # Wraps image markup and its caption text in a <figure> element.
  # Left public for the same reason as .iframe.
  #
  # @param image [#to_s] markup placed inside the figure
  # @param caption [#to_s] text placed inside the figcaption
  # @return [String]
  def self.figure(image, caption)
    # do terrible things to get output markdown friendly-ish
    "<figure>\n#{image}\n<figcaption>#{caption}\n</figcaption>\n</figure>"
  end

  # Normalises nil / single value / Array to an Array. Array() is not used
  # because it would explode a Hash into key/value pairs.
  def self.as_list(value)
    return [] if value.nil?

    value.is_a?(Array) ? value : [value]
  end

  # Writes the Markdown file for one post hash.
  def self.write_post(post, authors)
    title = post['title']
    post_date = post['post_date']
    slug = post['link'].to_s.split('/').last

    if title.nil? || post_date.nil? || slug.nil?
      warn "Skipping post without title, link or date: #{(title || post['link']).inspect}"
      return
    end

    created_at = Date.parse(post_date).to_s
    header = front_matter(
      title.gsub(':', '-'), # a colon would start a nested YAML mapping
      post_date,
      post['post_date_gmt'],
      author_name(authors, post['creator']),
      as_list(post['category'])
    )
    # 'encoded' may hold several <*:encoded> elements (presumably content and
    # excerpt - confirm against the export); they are joined as before.
    body = convert_embeds(as_list(post['encoded']).join("\n\n"))

    File.write(File.join(OUTPUT_PATH, "#{created_at}-#{slug}.html.markdown"), header + body)
  end

  # Resolves a post's creator login to "First Last"; falls back to the login
  # itself when the export lists no matching author.
  def self.author_name(authors, login)
    author = authors.find { |candidate| candidate['author_login'] == login }
    return login.to_s unless author

    "#{author['author_first_name']} #{author['author_last_name']}"
  end

  # Renders the YAML front matter block, including the closing "---" line.
  def self.front_matter(title, date, date_gmt, author, tags)
    tag_lines = tags.empty? ? 'tags: []' : "tags:\n - " + tags.join("\n - ")
    [
      '---',
      "title: #{title}",
      "date: #{date}",
      "date_gmt: #{date_gmt}",
      "authors: #{author}",
      tag_lines,
      'layout: blog',
      '---'
    ].join("\n") + "\n"
  end

  # Replaces WordPress shortcodes and bare media URLs with embeddable HTML.
  # The substitutions run in a fixed order: gist URL, gist shortcode, gallery,
  # Vimeo, YouTube, caption.
  def self.convert_embeds(content)
    converted = content.gsub(GIST_URL) do
      found = Regexp.last_match
      "<script src=\"https://gist.github.com/#{found[:username]}#{found[:id]}.js\"></script>"
    end
    converted = converted.gsub(GIST_SHORTCODE) do
      found = Regexp.last_match
      "<script src=\"https://gist.github.com/#{found[:id]}.js?file=#{found[:file]}\"></script>"
    end
    converted = converted.gsub(GALLERY_SHORTCODE, '')
    converted = converted.gsub(VIMEO_URL) do
      iframe("//player.vimeo.com/video/#{Regexp.last_match[:id]}")
    end
    converted = converted.gsub(YOUTUBE_URL) do
      iframe("//www.youtube.com/embed/#{Regexp.last_match[:id]}")
    end
    converted.gsub(CAPTION_SHORTCODE) { caption_figure(Regexp.last_match) }
  end

  # Builds the <figure> for one [caption] shortcode match.
  #
  # The caption text is taken from the inline attribute when present:
  #   [caption id="" align="alignnone" width="470" caption="Some text"]
  #   <img src="http://example.com/image.jpg" alt="" width="470" height="352" />
  #   [/caption]
  # Otherwise it is taken from the image's alt attribute and only the <img>
  # tag is kept:
  #   <a href="http://example.com/resource.jpg">
  #   <img src="http://example.com/resource.jpg" alt="Caption text" ... />
  #   Caption text
  #   </a>
  def self.caption_figure(match)
    shortcode = match[0]
    inline = /caption="([^"]*)/.match(shortcode)
    return figure(match[:inner], inline[1]) if inline

    alt = /alt="([^"]*)/.match(shortcode)
    img = /<img ([^>]*)>/.match(shortcode)
    # Keep the inner markup when no <img> tag is found rather than emitting
    # an empty figure.
    figure(img ? img[0] : match[:inner], alt && alt[1])
  end

  private_class_method :as_list, :write_post, :author_name, :front_matter,
                       :convert_embeds, :caption_figure
end
# Run the conversion: make sure the destination directory exists first, then
# write one Markdown file per published post.
[:make_output_path, :xml_to_hash].each { |step| Parser.public_send(step) }
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment