Created
July 9, 2014 19:39
-
-
Save iamdustan/3de22984fb28a18fffa5 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Input: a WordPress XML export file.
# Output: a series of Markdown files ready to be included in a Middleman site.
require 'rubygems'

require 'date'
require 'fileutils'
require 'time'

require 'active_support/core_ext/hash/conversions'
require 'html2md'
require 'nokogiri'
require 'pry'
require 'upmark'
# SETTINGS #

# Location of the exported WordPress archive, two directories above the
# working directory. PWD is exported by the shell rather than by Ruby, so fall
# back to Dir.pwd when it is missing instead of passing nil to File.join.
WORDPRESS_XML_FILE_PATH = File.join(ENV.fetch('PWD') { Dir.pwd }, '..', '..', 'skookumdigitalworks.wordpress.2014-06-13.xml')

# Directory the generated posts are saved to.
OUTPUT_PATH = File.join(ENV.fetch('PWD') { Dir.pwd }, '..', '..', 'source', 'blog')

# Domain of the original website.
ORIGINAL_DOMAIN = "http://www.skookum.com".freeze
# Converts a WordPress XML export into Markdown files for a Middleman blog.
#
# Every published post becomes one file in OUTPUT_PATH, named
# "<post date>-<last segment of the post link>.html.markdown". Each file is a
# YAML front matter block followed by the post body, with WordPress embeds and
# shortcodes rewritten as plain HTML.
class Parser
  # Gist URL at the start of a line, optionally prefixed by the owner's name.
  # Ids are matched as hex digits, which also covers all-numeric ids; the
  # trailing \b keeps a user-page URL such as ".../defunkt" from being cut
  # after its leading hex letters.
  GIST_URL = %r{^https?://gist\.github\.com/(?<username>[\w-]+/)?/?(?<id>\h+)\b}

  # [gist id=... file=...] shortcode, with or without quotes around the values.
  GIST_SHORTCODE = /\[gist id="?(\h+)"? file="?([^"\]]*)"?\]/

  # [gallery link="..."] shortcode at the start of a line; it is dropped.
  GALLERY_SHORTCODE = /^\[gallery link="[^"]*"\]/

  # Vimeo video URL at the start of a line.
  VIMEO_URL = %r{^https?://vimeo\.com/(\d+)}

  # YouTube video URL at the start of a line, in youtube.com or youtu.be form.
  # Anchored like the other embed patterns so URLs that already sit inside an
  # href/src attribute are left alone; ids may contain "-" as well as "_".
  YOUTUBE_URL = %r{^https?://(?:www\.)?(?:youtube\.com/(?:watch\?v=)?|youtu\.be/)(?<id>[\w-]+)}

  # [caption ...]...[/caption] shortcode; group 1 is the markup it wraps.
  CAPTION_SHORTCODE = %r{\[caption [^\]]*\]([^\[]*)\[/caption\]}

  # Creates OUTPUT_PATH (including missing parents) when it does not exist yet
  # and announces where files will be saved.
  #
  # @return [nil]
  def self.make_output_path
    return if File.directory?(OUTPUT_PATH)

    FileUtils.mkdir_p(OUTPUT_PATH)
    puts "Saving all files in #{OUTPUT_PATH}"
  end

  # Reads WORDPRESS_XML_FILE_PATH and writes one Markdown file per post whose
  # status is "publish". Posts without a title or a post date are skipped.
  #
  # @return [Array<Hash>] the post hashes read from the export
  def self.xml_to_hash
    # Block form closes the export file even when parsing raises.
    xml = File.open(WORDPRESS_XML_FILE_PATH) { |f| Nokogiri::XML(f) }
    channel = Hash.from_xml(xml.to_s)['rss']['channel']

    # An element that occurs only once is expected to arrive as a bare Hash
    # instead of an Array, so both collections are normalised first.
    authors = wrap(channel['author'])

    wrap(channel['item']).each do |post|
      next unless post['status'] == 'publish'

      export_post(post, authors)
    end
  end

  # Rewrites WordPress-specific embeds and shortcodes in a post body as HTML:
  # gist URLs and [gist] shortcodes become <script> tags, Vimeo and YouTube
  # URLs become iframes, [gallery] shortcodes are removed and [caption]
  # shortcodes become <figure> elements.
  #
  # @param content [String] raw post body
  # @return [String] a new string; the argument is not modified
  def self.transform_content(content)
    content
      .gsub(GIST_URL) do
        m = Regexp.last_match
        "<script src=\"https://gist.github.com/#{m[:username]}#{m[:id]}.js\"></script>"
      end
      .gsub(GIST_SHORTCODE) do
        m = Regexp.last_match
        "<script src=\"https://gist.github.com/#{m[1]}.js?file=#{m[2]}\"></script>"
      end
      .gsub(GALLERY_SHORTCODE, '')
      .gsub(VIMEO_URL) { iframe("//player.vimeo.com/video/#{Regexp.last_match(1)}") }
      .gsub(YOUTUBE_URL) { iframe("//www.youtube.com/embed/#{Regexp.last_match[:id]}") }
      .gsub(CAPTION_SHORTCODE) { |shortcode| caption_to_figure(shortcode, Regexp.last_match(1)) }
  end

  # Turns free text into a lowercase, dash-separated file name fragment.
  #
  # @param filename [String]
  # @return [String]
  def self.sanitize_filename(filename)
    filename
      .downcase
      .gsub(/[^\w\s_-]+/, '')
      .gsub(/(^|\b\s)\s+($|\s?\b)/, '\\1\\2')
      .gsub(/\s+/, '-')
      .squeeze('-')
  end

  # NOTE: iframe and figure stay publicly callable. The bare `private` that
  # used to precede them had no effect on methods defined with `def self.`.

  # Builds the fixed-size embed iframe used for videos.
  #
  # @param src [String] value for the iframe's src attribute
  # @return [String]
  def self.iframe(src)
    "<iframe src=\"#{src}\" width=\"500\" height=\"281\" frameborder=\"0\" allowfullscreen></iframe>"
  end

  # Builds a <figure> element. The line breaks keep the markup on separate
  # lines in the generated Markdown.
  #
  # @param image [#to_s] image markup
  # @param caption [#to_s] caption text
  # @return [String]
  def self.figure(image, caption)
    "<figure>\n#{image}\n<figcaption>#{caption}\n</figcaption>\n</figure>"
  end

  # Writes the Markdown file for a single published post.
  def self.export_post(post, authors)
    title = post['title']
    post_date = post['post_date']
    # Checked before anything is derived from them, so an untitled or undated
    # post is skipped instead of raising on nil.
    return if title.nil? || post_date.nil?

    created_at = Date.parse(post_date).to_s
    slug = post['link'].split('/').last
    output_filename = File.join(OUTPUT_PATH, "#{created_at}-#{slug}.html.markdown")

    header = front_matter(
      title.tr(':', '-'), # the title is written unquoted, so ":" is replaced
      post_date,
      post['post_date_gmt'],
      author_name(authors, post['creator']),
      wrap(post['category'])
    )
    body = transform_content(wrap(post['encoded']).join("\n\n"))

    File.open(output_filename, 'w') { |f| f.write(header + body) }
  end

  # Full name of the author whose login is +login+, or the login itself when
  # the export holds no matching author record.
  def self.author_name(authors, login)
    record = authors.find { |a| a['author_login'] == login }
    return login.to_s if record.nil?

    "#{record['author_first_name']} #{record['author_last_name']}"
  end

  # Builds the YAML front matter block, including both "---" fences and the
  # trailing newline.
  def self.front_matter(title, post_date, post_date_gmt, author, tags)
    lines = [
      '---',
      "title: #{title}",
      "date: #{post_date}",
      "date_gmt: #{post_date_gmt}",
      "authors: #{author}"
    ]
    if tags.empty?
      # An explicit empty list, rather than a list holding one blank entry.
      lines << 'tags: []'
    else
      lines << 'tags:'
      lines.concat(tags.map { |tag| " - #{tag}" })
    end
    lines << 'layout: blog' << '---'
    lines.join("\n") + "\n"
  end

  # Converts one [caption] shortcode into a <figure>.
  #
  # When the shortcode carries a caption="..." attribute, that text is the
  # caption and everything the shortcode wraps is the image markup:
  #   [caption id="" align="alignnone" width="470" caption="Some text"]
  #   <img src="..." alt="" width="470" height="352" />
  #   [/caption]
  # Otherwise the caption comes from the image's alt text and only the <img>
  # tag is kept:
  #   [caption ...]<a href="..."><img src="..." alt="Caption text" /> Caption text</a>[/caption]
  def self.caption_to_figure(shortcode, inner)
    attribute_caption = shortcode[/caption="([^"]*)/, 1]
    return figure(inner, attribute_caption) if attribute_caption

    figure(shortcode[/<img ([^>]*)>/], shortcode[/alt="([^"]*)/, 1])
  end

  # Normalises a value that may be nil, a single object or an Array into an
  # Array. Kernel#Array is not used because it would split a Hash into pairs.
  def self.wrap(value)
    case value
    when nil then []
    when Array then value
    else [value]
    end
  end

  private_class_method :export_post, :author_name, :front_matter, :caption_to_figure, :wrap
end
# Run the conversion: make sure the output directory exists, then write one
# Markdown file per published post.
Parser.make_output_path
Parser.xml_to_hash
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment