# iamdustan/wordpress_to_middleman.rb

## wordpress_to_middleman.rb

# Input: WordPress XML export file.
# Outputs: a series of Markdown files ready to be included in a middleman site

require 'pry'

require 'rubygems'
require 'nokogiri'
require 'upmark'
require 'html2md'
require 'time'
require 'fileutils'
require 'active_support/core_ext/hash/conversions'

# SETTINGS #
# Both paths are resolved two directories above the directory the script is
# launched from. ENV['PWD'] is not set in every environment (e.g. Windows or
# some cron setups), in which case File.join would raise a TypeError on nil,
# so fall back to Dir.pwd.
WORDPRESS_XML_FILE_PATH = File.join(ENV.fetch('PWD') { Dir.pwd }, '..', '..', '/skookumdigitalworks.wordpress.2014-06-13.xml')  # THE LOCATION OF THE EXPORTED WORDPRESS ARCHIVE #
OUTPUT_PATH = File.join(ENV.fetch('PWD') { Dir.pwd }, '..', '..', 'source', 'blog')  # THE LOCATION OF THE SAVED POSTS #
ORIGINAL_DOMAIN = "http://www.skookum.com"  #  THE DOMAIN OF THE WEBSITE #

# Converts a WordPress XML export into one Markdown file per published post,
# each prefixed with YAML front matter for a Middleman blog.
#
# Relies on the file-level constants WORDPRESS_XML_FILE_PATH and OUTPUT_PATH.
class Parser
  # Creates OUTPUT_PATH (including missing parent directories) when it does
  # not exist yet and reports the destination on stdout. Does nothing when
  # the directory is already present.
  def self.make_output_path
    return if File.directory?(OUTPUT_PATH)

    FileUtils.mkdir_p(OUTPUT_PATH)
    # A separating space is required here; without it the path runs straight
    # into the word "in".
    puts "Saving all files in #{OUTPUT_PATH}"
  end

  # Reads the export at WORDPRESS_XML_FILE_PATH and writes every post whose
  # status is "publish" to OUTPUT_PATH as "<date>-<slug>.html.markdown".
  #
  # Returns the list of all posts found in the export.
  def self.xml_to_hash
    # Block form closes the file handle as soon as Nokogiri has read it.
    xml = File.open(WORDPRESS_XML_FILE_PATH) { |f| Nokogiri::XML(f) }
    channel = Hash.from_xml(xml.to_s)['rss']['channel']

    # Hash.from_xml yields a bare Hash (not an Array) when an element occurs
    # only once, so normalise both collections before iterating.
    authors = wrap(channel['author'])
    posts = wrap(channel['item'])

    posts.each do |post|
      next if post['status'] != 'publish'

      write_post(post, authors)
    end
  end

  # Lower-cases +filename+, removes every character that is not a word
  # character, whitespace or dash, then replaces whitespace runs with a dash
  # and squeezes repeated dashes into one.
  def self.sanitize_filename(filename)
    filename
      .downcase
      .gsub(/[^\w\s_-]+/, '')
      .gsub(/(^|\b\s)\s+($|\s?\b)/, '\\1\\2')
      .gsub(/\s+/, '-')
      .gsub(/[-]+/, '-')
  end

  # NOTE: a bare `private` keyword has no effect on `def self.` methods, so
  # the two HTML helpers below are publicly callable. They are deliberately
  # left public so that no existing caller breaks.

  # Returns the markup for a 500x281 iframe that embeds +src+.
  def self.iframe(src)
    "<iframe src=\"#{src}\" width=\"500\" height=\"281\" frameborder=\"0\" allowfullscreen></iframe>"
  end

  # Wraps +image+ markup and its +caption+ text in a <figure> element.
  def self.figure(image, caption)
    # do terrible things to get output markdown friendly-ish
    "<figure>\n#{image}\n<figcaption>#{caption}\n</figcaption>\n</figure>"
  end

  # Normalises a value that may be nil, a single object or an Array into an
  # Array. Kernel#Array is not used because it would split a Hash into
  # key/value pairs.
  def self.wrap(value)
    case value
    when nil then []
    when Array then value
    else [value]
    end
  end

  # Looks up the display name ("First Last") for the author whose login is
  # +login+. Falls back to the login itself when the author is missing from
  # +authors+ or has no name recorded.
  def self.author_name(login, authors)
    record = authors.find { |a| a['author_login'] == login }
    return login.to_s unless record

    name = "#{record['author_first_name']} #{record['author_last_name']}".strip
    name.empty? ? login.to_s : name
  end

  # Builds a <figure> from one matched [caption]...[/caption] +shortcode+.
  # +inner+ is the text between the opening and closing tags.
  def self.caption_to_figure(shortcode, inner)
    # Caption given as an attribute:
    #   [caption id="" align="alignnone" width="470" caption="Some text"]
    #   <img src="..." alt="" width="470" height="352" />
    #   [/caption]
    attribute = /caption="([^\"]*)/.match(shortcode)
    return figure(inner, attribute[1]) if attribute

    # Otherwise the caption text sits inline inside an anchor, so use the
    # image's alt text as the caption and keep only the <img> tag:
    #   <a href="..."><img src="..." alt="Caption text" /> Caption text</a>
    alt = /alt="([^\"]*)/.match(shortcode)
    img = /<img ([^\>]*)>/.match(shortcode)
    # Keep the inner markup when no <img> tag is present rather than
    # emitting an empty figure.
    figure(img ? img[0] : inner, alt && alt[1])
  end

  # Returns a copy of +content+ in which WordPress shortcodes and bare embed
  # URLs (gist, vimeo, youtube, gallery, caption) are replaced by HTML.
  def self.convert_embeds(content)
    # Bare gist URL at the start of a line, with or without "username/".
    html = content.gsub(%r{^https?://gist\.github\.com/(?<username>\w+/)?/?(?<id>\d+)}) do
      m = Regexp.last_match
      "<script src=\"https://gist.github.com/#{m[:username]}#{m[:id]}.js\"></script>"
    end

    # [gist id=... file=...] shortcode, quoted or not. The file name must
    # stop at the closing bracket, otherwise an unquoted value swallows the
    # text up to a later "]".
    html = html.gsub(/\[gist id="?(\d+)"? file="?([^"\]]*)"?\]/) do
      m = Regexp.last_match
      "<script src=\"https://gist.github.com/#{m[1]}.js?file=#{m[2]}\"></script>"
    end

    # Galleries have no equivalent in the target site; drop the shortcode.
    html = html.gsub(/^\[gallery link="[^\"]*"\]/, "")

    html = html.gsub(%r{^https?://vimeo\.com/(\d+)}) do
      iframe("//player.vimeo.com/video/#{Regexp.last_match(1)}")
    end

    # Video ids may contain "-" as well as word characters. The id is read
    # through its name because unnamed groups are not numbered once a named
    # group is present in the pattern.
    html = html.gsub(%r{https?://(?:www\.)?youtu\.?be\.com/(?:watch\?v=)*(?<id>[\w-]+)}) do
      iframe("//www.youtube.com/embed/#{Regexp.last_match[:id]}")
    end

    html.gsub(%r{\[caption [^\]]*\]([^\[]*)\[/caption\]}) do |shortcode|
      caption_to_figure(shortcode, Regexp.last_match(1))
    end
  end

  # Builds the YAML front matter block, including the trailing newline after
  # the closing "---". An empty +tags+ list is written as "tags: []".
  def self.front_matter(title, post_date, post_date_gmt, author, tags, layout)
    tag_lines = tags.empty? ? ['tags: []'] : ['tags:'] + tags.map { |tag| "  - #{tag}" }

    lines = [
      '---',
      "title: #{title}",
      "date: #{post_date}",
      "date_gmt: #{post_date_gmt}",
      "authors: #{author}"
    ]
    lines.concat(tag_lines)
    lines.push("layout: #{layout}", '---')
    lines.join("\n") + "\n"
  end

  # Converts a single published +post+ hash and saves it under OUTPUT_PATH.
  # Posts without a title, a post date or any encoded body are skipped.
  def self.write_post(post, authors)
    title = post['title']
    post_date = post['post_date']
    encoded = post['encoded']
    # Check for missing fields before they are dereferenced, so an
    # incomplete post is skipped instead of raising.
    return if title.nil? || post_date.nil? || encoded.nil?

    # Parsing Post Frontmatter
    # ------------------------------------
    slug = post['link'].split('/').last
    created_at = Date.parse(post_date).to_s
    header = front_matter(
      title.gsub(':', '-'), # a colon would start a new YAML mapping
      post_date,
      post['post_date_gmt'],
      author_name(post['creator'], authors),
      wrap(post['category']),
      'blog'
    )

    # Parsing Post Content
    # ------------------------------------
    content = convert_embeds(wrap(encoded).join("\n\n"))

    # Saving File
    # ------------------------------------
    output_filename = File.join(OUTPUT_PATH, "#{created_at}-#{slug}.html.markdown")
    File.write(output_filename, header + content)
  end

  private_class_method :wrap, :author_name, :caption_to_figure,
                       :convert_embeds, :front_matter, :write_post
end

# Run the conversion. The output directory has to exist before xml_to_hash
# writes any post into it, so the order of these two calls matters.
Parser.make_output_path
Parser.xml_to_hash

	# Input: WordPress XML export file.
	# Outputs: a series of Markdown files ready to be included in a middleman site

	require 'pry'

	require 'rubygems'
	require 'nokogiri'
	require 'upmark'
	require 'html2md'
	require 'time'
	require 'fileutils'
	require 'active_support/core_ext/hash/conversions'

	# SETTINGS #
	# Both paths are resolved two directories above the directory the script is
	# launched from. ENV['PWD'] is not set in every environment (e.g. Windows or
	# some cron setups), in which case File.join would raise a TypeError on nil,
	# so fall back to Dir.pwd.
	WORDPRESS_XML_FILE_PATH = File.join(ENV.fetch('PWD') { Dir.pwd }, '..', '..', '/skookumdigitalworks.wordpress.2014-06-13.xml') # THE LOCATION OF THE EXPORTED WORDPRESS ARCHIVE #
	OUTPUT_PATH = File.join(ENV.fetch('PWD') { Dir.pwd }, '..', '..', 'source', 'blog') # THE LOCATION OF THE SAVED POSTS #
	ORIGINAL_DOMAIN = "http://www.skookum.com" # THE DOMAIN OF THE WEBSITE #

	# Converts a WordPress XML export into one Markdown file per published post,
	# each prefixed with YAML front matter for a Middleman blog.
	#
	# Relies on the file-level constants WORDPRESS_XML_FILE_PATH and OUTPUT_PATH.
	#
	# Block parameters are written with plain `|` delimiters and the caption
	# pattern carries its `*` quantifiers; backslash-escaped pipes are not valid
	# Ruby in block headers.
	class Parser
	  # Creates OUTPUT_PATH (including missing parent directories) when it does
	  # not exist yet and reports the destination on stdout. Does nothing when
	  # the directory is already present.
	  def self.make_output_path
	    return if File.directory?(OUTPUT_PATH)

	    FileUtils.mkdir_p(OUTPUT_PATH)
	    # A separating space is required here; without it the path runs straight
	    # into the word "in".
	    puts "Saving all files in #{OUTPUT_PATH}"
	  end

	  # Reads the export at WORDPRESS_XML_FILE_PATH and writes every post whose
	  # status is "publish" to OUTPUT_PATH as "<date>-<slug>.html.markdown".
	  #
	  # Returns the list of all posts found in the export.
	  def self.xml_to_hash
	    # Block form closes the file handle as soon as Nokogiri has read it.
	    xml = File.open(WORDPRESS_XML_FILE_PATH) { |f| Nokogiri::XML(f) }
	    channel = Hash.from_xml(xml.to_s)['rss']['channel']

	    # Hash.from_xml yields a bare Hash (not an Array) when an element occurs
	    # only once, so normalise both collections before iterating.
	    authors = wrap(channel['author'])
	    posts = wrap(channel['item'])

	    posts.each do |post|
	      next if post['status'] != 'publish'

	      write_post(post, authors)
	    end
	  end

	  # Lower-cases +filename+, removes every character that is not a word
	  # character, whitespace or dash, then replaces whitespace runs with a dash
	  # and squeezes repeated dashes into one.
	  def self.sanitize_filename(filename)
	    filename
	      .downcase
	      .gsub(/[^\w\s_-]+/, '')
	      .gsub(/(^|\b\s)\s+($|\s?\b)/, '\\1\\2')
	      .gsub(/\s+/, '-')
	      .gsub(/[-]+/, '-')
	  end

	  # NOTE: a bare `private` keyword has no effect on `def self.` methods, so
	  # the two HTML helpers below are publicly callable. They are deliberately
	  # left public so that no existing caller breaks.

	  # Returns the markup for a 500x281 iframe that embeds +src+.
	  def self.iframe(src)
	    "<iframe src=\"#{src}\" width=\"500\" height=\"281\" frameborder=\"0\" allowfullscreen></iframe>"
	  end

	  # Wraps +image+ markup and its +caption+ text in a <figure> element.
	  def self.figure(image, caption)
	    # do terrible things to get output markdown friendly-ish
	    "<figure>\n#{image}\n<figcaption>#{caption}\n</figcaption>\n</figure>"
	  end

	  # Normalises a value that may be nil, a single object or an Array into an
	  # Array. Kernel#Array is not used because it would split a Hash into
	  # key/value pairs.
	  def self.wrap(value)
	    case value
	    when nil then []
	    when Array then value
	    else [value]
	    end
	  end

	  # Looks up the display name ("First Last") for the author whose login is
	  # +login+. Falls back to the login itself when the author is missing from
	  # +authors+ or has no name recorded.
	  def self.author_name(login, authors)
	    record = authors.find { |a| a['author_login'] == login }
	    return login.to_s unless record

	    name = "#{record['author_first_name']} #{record['author_last_name']}".strip
	    name.empty? ? login.to_s : name
	  end

	  # Builds a <figure> from one matched [caption]...[/caption] +shortcode+.
	  # +inner+ is the text between the opening and closing tags.
	  def self.caption_to_figure(shortcode, inner)
	    # Caption given as an attribute:
	    #   [caption id="" align="alignnone" width="470" caption="Some text"]
	    #   <img src="..." alt="" width="470" height="352" />
	    #   [/caption]
	    attribute = /caption="([^\"]*)/.match(shortcode)
	    return figure(inner, attribute[1]) if attribute

	    # Otherwise the caption text sits inline inside an anchor, so use the
	    # image's alt text as the caption and keep only the <img> tag:
	    #   <a href="..."><img src="..." alt="Caption text" /> Caption text</a>
	    alt = /alt="([^\"]*)/.match(shortcode)
	    img = /<img ([^\>]*)>/.match(shortcode)
	    # Keep the inner markup when no <img> tag is present rather than
	    # emitting an empty figure.
	    figure(img ? img[0] : inner, alt && alt[1])
	  end

	  # Returns a copy of +content+ in which WordPress shortcodes and bare embed
	  # URLs (gist, vimeo, youtube, gallery, caption) are replaced by HTML.
	  def self.convert_embeds(content)
	    # Bare gist URL at the start of a line, with or without "username/".
	    html = content.gsub(%r{^https?://gist\.github\.com/(?<username>\w+/)?/?(?<id>\d+)}) do
	      m = Regexp.last_match
	      "<script src=\"https://gist.github.com/#{m[:username]}#{m[:id]}.js\"></script>"
	    end

	    # [gist id=... file=...] shortcode, quoted or not. The file name must
	    # stop at the closing bracket, otherwise an unquoted value swallows the
	    # text up to a later "]".
	    html = html.gsub(/\[gist id="?(\d+)"? file="?([^"\]]*)"?\]/) do
	      m = Regexp.last_match
	      "<script src=\"https://gist.github.com/#{m[1]}.js?file=#{m[2]}\"></script>"
	    end

	    # Galleries have no equivalent in the target site; drop the shortcode.
	    html = html.gsub(/^\[gallery link="[^\"]*"\]/, "")

	    html = html.gsub(%r{^https?://vimeo\.com/(\d+)}) do
	      iframe("//player.vimeo.com/video/#{Regexp.last_match(1)}")
	    end

	    # Video ids may contain "-" as well as word characters. The id is read
	    # through its name because unnamed groups are not numbered once a named
	    # group is present in the pattern.
	    html = html.gsub(%r{https?://(?:www\.)?youtu\.?be\.com/(?:watch\?v=)*(?<id>[\w-]+)}) do
	      iframe("//www.youtube.com/embed/#{Regexp.last_match[:id]}")
	    end

	    html.gsub(%r{\[caption [^\]]*\]([^\[]*)\[/caption\]}) do |shortcode|
	      caption_to_figure(shortcode, Regexp.last_match(1))
	    end
	  end

	  # Builds the YAML front matter block, including the trailing newline after
	  # the closing "---". An empty +tags+ list is written as "tags: []".
	  def self.front_matter(title, post_date, post_date_gmt, author, tags, layout)
	    tag_lines = tags.empty? ? ['tags: []'] : ['tags:'] + tags.map { |tag| "  - #{tag}" }

	    lines = [
	      '---',
	      "title: #{title}",
	      "date: #{post_date}",
	      "date_gmt: #{post_date_gmt}",
	      "authors: #{author}"
	    ]
	    lines.concat(tag_lines)
	    lines.push("layout: #{layout}", '---')
	    lines.join("\n") + "\n"
	  end

	  # Converts a single published +post+ hash and saves it under OUTPUT_PATH.
	  # Posts without a title, a post date or any encoded body are skipped.
	  def self.write_post(post, authors)
	    title = post['title']
	    post_date = post['post_date']
	    encoded = post['encoded']
	    # Check for missing fields before they are dereferenced, so an
	    # incomplete post is skipped instead of raising.
	    return if title.nil? || post_date.nil? || encoded.nil?

	    # Parsing Post Frontmatter
	    # ------------------------------------
	    slug = post['link'].split('/').last
	    created_at = Date.parse(post_date).to_s
	    header = front_matter(
	      title.gsub(':', '-'), # a colon would start a new YAML mapping
	      post_date,
	      post['post_date_gmt'],
	      author_name(post['creator'], authors),
	      wrap(post['category']),
	      'blog'
	    )

	    # Parsing Post Content
	    # ------------------------------------
	    content = convert_embeds(wrap(encoded).join("\n\n"))

	    # Saving File
	    # ------------------------------------
	    output_filename = File.join(OUTPUT_PATH, "#{created_at}-#{slug}.html.markdown")
	    File.write(output_filename, header + content)
	  end

	  private_class_method :wrap, :author_name, :caption_to_figure,
	                       :convert_embeds, :front_matter, :write_post
	end

	# Run the conversion. The output directory has to exist before xml_to_hash
	# writes any post into it, so the order of these two calls matters.
	Parser.make_output_path
	Parser.xml_to_hash