# tomelm/wordpress_jekyll_converter.rb

## wordpress_jekyll_converter.rb
require 'date'
require 'nokogiri'
require 'rest-client'
require 'reverse_markdown'

# Matches either an opening WordPress caption shortcode such as
# `[caption id="x"]` or the closing `[/caption]`. The lookahead only accepts
# an opening tag when another `[` appears later on the same line.
# example: http://rubular.com/r/r2FH3QSOpL
CAPTION_REGEX = /\[caption.*\](?=.*\[)|\[\/caption\]/


# Matches the entirety of an img html tag (non-greedy, up to the first `>`)
# example: http://rubular.com/r/xU3ZUF1vvY
IMG_TAG_REGEX = /(<img.*?>)/
# Matches a double-quoted src attribute, quotes included
IMG_SRC_REGEX = /src=".*?"/
# Captures the double-quoted src value of an img tag in group 1. The `.*`
# before `src` is greedy, so with several img tags on one line the last src
# is the one captured.
IMG_SRC_GROUP_REGEX = /<img.*src="(.*?)".*\/?>/


# Regexes for gist shortcode detection and info extraction:
# GIST_REGEX detects a `[gist ...]` shortcode, GIST_ID_REGEX captures the
# value of `id="..."` and GIST_FILE_REGEX the value of a trailing `file="..."`.
GIST_REGEX = /\[gist.*\]/
GIST_ID_REGEX = /\[gist id="(.*?)".*\]/
GIST_FILE_REGEX = /\[gist .* file="(.*)"\]/


# Base url prepended to image paths that are not absolute URLs.
# Frozen so the shared constant cannot be mutated by accident.
BLOG_BASE_URL = 'http://engineeringblog.yelp.com'.freeze


# Read an XML file from disk and hand its contents to Nokogiri
#
# path - location of the XML file to load
#
# Returns whatever Nokogiri::XML produces for the file's contents
def open_xml_file(path)
  raw_xml = File.read(path)
  Nokogiri::XML(raw_xml)
end


# Builds a lookup of every author declared in a WordPress XML export
#
# xml - a parsed document that answers #xpath (a Nokogiri XML object)
#
# Examples
#
#   extract_authors(xml)
#   # => { 'darwin' => 'Darwin S., Software engineer' }
#
# Returns a Hash mapping each author's login to their display name
def extract_authors(xml)
  xml.xpath('//wp:author').each_with_object({}) do |node, by_login|
    login = node.xpath('wp:author_login').first.text
    by_login[login] = node.xpath('wp:author_display_name').first.text
  end
end


# Wraps a single <item> node of a WordPress export and exposes the pieces
# needed to write it out as a Jekyll post: file name, front matter, converted
# body, and the source URLs of the images the body references.
class Post
  # Source URLs of the images found in the body. Filled in as a side effect
  # of the first call to #content.
  attr_accessor :images

  # xml     - the node for one post; it is only queried through #xpath
  # authors - optional Hash of author login => display name
  def initialize(xml, authors = {})
    @xml = xml
    @images = []
    @author = author(authors)
  end

  # Returns the post title as found in the export
  def title
    @title ||= @xml.xpath('title').text
  end

  # authors - optional Hash of author login => display name
  #
  # Returns the display name for the post's creator, falling back to the raw
  # login when the Hash has no entry for it. The value is memoized, so the
  # argument only matters on the first call.
  def author(authors = {})
    @author ||= begin
      login = @xml.xpath('dc:creator').text
      authors[login] || login
    end
  end

  # Returns the publication date parsed into a DateTime
  def date
    @date ||= DateTime.parse(@xml.xpath('pubDate').text)
  end

  # Returns the post's WordPress slug
  def post_name
    @post_name ||= @xml.xpath('wp:post_name').text
  end

  # Returns the Jekyll file name, "<yyyy-mm-dd>-<slug>.markdown"
  def file_name
    "#{date.strftime('%Y-%m-%d')}-#{post_name}.markdown"
  end

  # Returns the YAML front matter block, ending right after the closing `---`
  def front_matter
    # The title sits inside a double-quoted YAML scalar, so quotes and
    # backslashes in it must be escaped to keep the front matter parseable.
    escaped_title = title.gsub(/["\\]/) { |char| "\\#{char}" }

    # The gsub removes the heredoc indentation (and any blank lines).
    <<-eos.gsub(/^\s+/, '')
      ---
      layout: post
      title: "#{escaped_title}"
      author: #{author}
      date: #{date}
      published: true
      ---
    eos
  end

  # Converts the post body line by line through ReverseMarkdown.
  #
  # The result is memoized so repeated calls neither redo the conversion nor
  # append the same image URLs to #images again.
  #
  # Returns the converted body as a String
  def content
    return @content unless @content.nil?

    raw_lines = @xml.xpath('content:encoded').text.split("\n")
    converted = raw_lines.map { |line| ReverseMarkdown.convert(clean_line(line)) }

    @content = converted.join
  end

  private

  # Prepares one body line for conversion: points its image at the local
  # images directory, turns caption/gist shortcodes into Liquid snippets and
  # wraps anything else in a paragraph.
  #
  # Returns the cleaned line
  def clean_line(line)
    line, image_path = relocate_image(line)

    # A gist match deliberately overrides a caption match, as before.
    cleaned_line = extract_caption(line, image_path) if line =~ CAPTION_REGEX
    cleaned_line = extract_gist(line)                if line =~ GIST_REGEX
    cleaned_line ||= "<p>#{line}</p>"

    cleaned_line.gsub('’', "'")  # fix unicode apostrophe issues
  end

  # Records the image URL found in the line and rewrites src attributes to
  # the local path for this post.
  #
  # Returns [line, image_path]; image_path is nil (and the line untouched)
  # when the line has no img tag or its src cannot be extracted.
  def relocate_image(line)
    return [line, nil] unless line.index(IMG_TAG_REGEX)

    image_url = line[IMG_SRC_GROUP_REGEX, 1]
    # e.g. a single-quoted src: nothing to download or rewrite
    return [line, nil] if image_url.nil?

    @images << image_url
    image_path = "/images/posts/#{post_name}/#{File.basename(image_url)}"

    # Keep the quotes so the attribute stays well-formed even right before
    # `/>`. Block form avoids backslash interpretation in the replacement.
    [line.gsub(IMG_SRC_REGEX) { "src=\"#{image_path}\"" }, image_path]
  end

  # Returns a Liquid include for an image with its caption text
  def extract_caption(line, image_url)
    # TODO figure out how I want to handle images and captions later
    # ref: http://stackoverflow.com/questions/19331362/using-an-image-caption-in-markdown-jekyll

    caption = Nokogiri::HTML(line).text
                                  .sub(CAPTION_REGEX, '')  # remove opening caption
                                  .sub(CAPTION_REGEX, '')  # remove closing caption
                                  .strip                   # after removal, so no padding is left

    "{% include post/image.html image=\"#{image_url}\" caption=\"#{caption}\" %}"
  end

  # Returns a gist snippet built from the shortcode's id and file.
  # NOTE(review): Jekyll's gist tag is normally written `{% gist ... %}`;
  # the `{{ ... }}` form is kept as-is here - confirm it is intended.
  def extract_gist(line)
    "{{ gist #{line[GIST_ID_REGEX, 1]} #{line[GIST_FILE_REGEX, 1]} }}"
  end
end


# Script entry point: reads ./blog.xml, writes every post to _posts/ and
# downloads the images each post references into images/posts/<slug>/.
xml = open_xml_file('./blog.xml')
authors = extract_authors(xml)
raw_posts = xml.xpath('//item')

# Create the output directories, tolerating ones left over from an earlier
# run (a bare Dir.mkdir raises Errno::EEXIST in that case).
%w[_posts images images/posts].each do |dir|
  Dir.mkdir(dir) unless Dir.exist?(dir)
end

puts 'Converting posts'
posts = raw_posts.map { |item| Post.new(item, authors) }

puts 'Processing and writing posts, images'
posts.each do |post|
  puts post.file_name

  File.write("_posts/#{post.file_name}", post.front_matter + post.content)

  # The guard also covers posts with an empty slug, whose image directory
  # resolves to the already existing images/posts/.
  image_dir = 'images/posts/' + post.post_name
  Dir.mkdir(image_dir) unless Dir.exist?(image_dir)

  post.images.each do |image|
    next if image.nil?
    # Only a leading "http" marks an absolute URL; a path that merely
    # contains "http" somewhere still needs the blog's base URL.
    image = BLOG_BASE_URL + image unless image.start_with?('http')

    puts "|--> #{image}"

    begin
      # Download before opening the target so a failed request does not
      # leave an empty file behind.
      data = RestClient.get(image)
      File.open("#{image_dir}/#{File.basename(image)}", 'wb') do |file|
        file.write(data)
      end
    rescue StandardError => e
      # Best effort: report and continue. StandardError (not Exception) so
      # Ctrl-C and exit still stop the script.
      puts "failed to download #{image} - #{e.message}"
    end
  end
end
	require 'date'
	require 'nokogiri'
	require 'rest-client'
	require 'reverse_markdown'

	# Matches either an opening WordPress caption shortcode such as
	# `[caption id="x"]` or the closing `[/caption]`. The lookahead only accepts
	# an opening tag when another `[` appears later on the same line.
	# example: http://rubular.com/r/r2FH3QSOpL
	CAPTION_REGEX = /\[caption.*\](?=.*\[)|\[\/caption\]/


	# Matches the entirety of an img html tag (non-greedy, up to the first `>`)
	# example: http://rubular.com/r/xU3ZUF1vvY
	IMG_TAG_REGEX = /(<img.*?>)/
	# Matches a double-quoted src attribute, quotes included
	IMG_SRC_REGEX = /src=".*?"/
	# Captures the double-quoted src value of an img tag in group 1
	IMG_SRC_GROUP_REGEX = /<img.*src="(.*?)".*\/?>/


	# Regexes for gist shortcode detection and info extraction
	GIST_REGEX = /\[gist.*\]/
	GIST_ID_REGEX = /\[gist id="(.*?)".*\]/
	GIST_FILE_REGEX = /\[gist .* file="(.*)"\]/


	# Base url prepended to image paths that are not absolute URLs.
	# Frozen so the shared constant cannot be mutated by accident.
	BLOG_BASE_URL = 'http://engineeringblog.yelp.com'.freeze


	# Read an XML file from disk and hand its contents to Nokogiri
	#
	# path - location of the XML file to load
	#
	# Returns whatever Nokogiri::XML produces for the file's contents
	def open_xml_file(path)
	  raw_xml = File.read(path)
	  Nokogiri::XML(raw_xml)
	end


	# Builds a lookup of every author declared in a WordPress XML export
	#
	# xml - a parsed document that answers #xpath (a Nokogiri XML object)
	#
	# Examples
	#
	#   extract_authors(xml)
	#   # => { 'darwin' => 'Darwin S., Software engineer' }
	#
	# Returns a Hash mapping each author's login to their display name
	def extract_authors(xml)
	  # Block parameters use plain pipes; the escaped `\|` form is not Ruby.
	  xml.xpath('//wp:author').each_with_object({}) do |node, by_login|
	    login = node.xpath('wp:author_login').first.text
	    by_login[login] = node.xpath('wp:author_display_name').first.text
	  end
	end


	# Wraps a single <item> node of a WordPress export and exposes the pieces
	# needed to write it out as a Jekyll post: file name, front matter, converted
	# body, and the source URLs of the images the body references.
	class Post
	  # Source URLs of the images found in the body. Filled in as a side effect
	  # of the first call to #content.
	  attr_accessor :images

	  # xml     - the node for one post; it is only queried through #xpath
	  # authors - optional Hash of author login => display name
	  def initialize(xml, authors = {})
	    @xml = xml
	    @images = []
	    @author = author(authors)
	  end

	  # Returns the post title as found in the export
	  def title
	    @title ||= @xml.xpath('title').text
	  end

	  # authors - optional Hash of author login => display name
	  #
	  # Returns the display name for the post's creator, falling back to the raw
	  # login when the Hash has no entry for it. The value is memoized, so the
	  # argument only matters on the first call.
	  def author(authors = {})
	    @author ||= begin
	      login = @xml.xpath('dc:creator').text
	      authors[login] || login
	    end
	  end

	  # Returns the publication date parsed into a DateTime
	  def date
	    @date ||= DateTime.parse(@xml.xpath('pubDate').text)
	  end

	  # Returns the post's WordPress slug
	  def post_name
	    @post_name ||= @xml.xpath('wp:post_name').text
	  end

	  # Returns the Jekyll file name, "<yyyy-mm-dd>-<slug>.markdown"
	  def file_name
	    "#{date.strftime('%Y-%m-%d')}-#{post_name}.markdown"
	  end

	  # Returns the YAML front matter block, ending right after the closing `---`
	  def front_matter
	    # The title sits inside a double-quoted YAML scalar, so quotes and
	    # backslashes in it must be escaped to keep the front matter parseable.
	    escaped_title = title.gsub(/["\\]/) { |char| "\\#{char}" }

	    # The gsub removes the heredoc indentation (and any blank lines).
	    <<-eos.gsub(/^\s+/, '')
	      ---
	      layout: post
	      title: "#{escaped_title}"
	      author: #{author}
	      date: #{date}
	      published: true
	      ---
	    eos
	  end

	  # Converts the post body line by line through ReverseMarkdown.
	  #
	  # The result is memoized so repeated calls neither redo the conversion nor
	  # append the same image URLs to #images again.
	  #
	  # Returns the converted body as a String
	  def content
	    return @content unless @content.nil?

	    raw_lines = @xml.xpath('content:encoded').text.split("\n")
	    converted = raw_lines.map { |line| ReverseMarkdown.convert(clean_line(line)) }

	    @content = converted.join
	  end

	  private

	  # Prepares one body line for conversion: points its image at the local
	  # images directory, turns caption/gist shortcodes into Liquid snippets and
	  # wraps anything else in a paragraph.
	  #
	  # Returns the cleaned line
	  def clean_line(line)
	    line, image_path = relocate_image(line)

	    # A gist match deliberately overrides a caption match.
	    cleaned_line = extract_caption(line, image_path) if line =~ CAPTION_REGEX
	    cleaned_line = extract_gist(line)                if line =~ GIST_REGEX
	    cleaned_line ||= "<p>#{line}</p>"

	    cleaned_line.gsub('’', "'") # fix unicode apostrophe issues
	  end

	  # Records the image URL found in the line and rewrites src attributes to
	  # the local path for this post.
	  #
	  # Returns [line, image_path]; image_path is nil (and the line untouched)
	  # when the line has no img tag or its src cannot be extracted.
	  def relocate_image(line)
	    return [line, nil] unless line.index(IMG_TAG_REGEX)

	    image_url = line[IMG_SRC_GROUP_REGEX, 1]
	    # e.g. a single-quoted src: nothing to download or rewrite
	    return [line, nil] if image_url.nil?

	    @images << image_url
	    image_path = "/images/posts/#{post_name}/#{File.basename(image_url)}"

	    # Keep the quotes so the attribute stays well-formed even right before
	    # `/>`. Block form avoids backslash interpretation in the replacement.
	    [line.gsub(IMG_SRC_REGEX) { "src=\"#{image_path}\"" }, image_path]
	  end

	  # Returns a Liquid include for an image with its caption text
	  def extract_caption(line, image_url)
	    # TODO figure out how I want to handle images and captions later
	    # ref: http://stackoverflow.com/questions/19331362/using-an-image-caption-in-markdown-jekyll

	    caption = Nokogiri::HTML(line).text
	                                  .sub(CAPTION_REGEX, '') # remove opening caption
	                                  .sub(CAPTION_REGEX, '') # remove closing caption
	                                  .strip                  # after removal, so no padding is left

	    "{% include post/image.html image=\"#{image_url}\" caption=\"#{caption}\" %}"
	  end

	  # Returns a gist snippet built from the shortcode's id and file.
	  # NOTE(review): Jekyll's gist tag is normally written `{% gist ... %}`;
	  # the `{{ ... }}` form is kept as-is here - confirm it is intended.
	  def extract_gist(line)
	    "{{ gist #{line[GIST_ID_REGEX, 1]} #{line[GIST_FILE_REGEX, 1]} }}"
	  end
	end


	# Script entry point: reads ./blog.xml, writes every post to _posts/ and
	# downloads the images each post references into images/posts/<slug>/.
	xml = open_xml_file('./blog.xml')
	authors = extract_authors(xml)
	raw_posts = xml.xpath('//item')

	# Create the output directories, tolerating ones left over from an earlier
	# run (a bare Dir.mkdir raises Errno::EEXIST in that case).
	%w[_posts images images/posts].each do |dir|
	  Dir.mkdir(dir) unless Dir.exist?(dir)
	end

	puts 'Converting posts'
	posts = raw_posts.map { |item| Post.new(item, authors) }

	puts 'Processing and writing posts, images'
	posts.each do |post|
	  puts post.file_name

	  File.write("_posts/#{post.file_name}", post.front_matter + post.content)

	  # The guard also covers posts with an empty slug, whose image directory
	  # resolves to the already existing images/posts/.
	  image_dir = 'images/posts/' + post.post_name
	  Dir.mkdir(image_dir) unless Dir.exist?(image_dir)

	  post.images.each do |image|
	    next if image.nil?
	    # Only a leading "http" marks an absolute URL; a path that merely
	    # contains "http" somewhere still needs the blog's base URL.
	    image = BLOG_BASE_URL + image unless image.start_with?('http')

	    puts "|--> #{image}"

	    begin
	      # Download before opening the target so a failed request does not
	      # leave an empty file behind.
	      data = RestClient.get(image)
	      File.open("#{image_dir}/#{File.basename(image)}", 'wb') do |file|
	        file.write(data)
	      end
	    rescue StandardError => e
	      # Best effort: report and continue. StandardError (not Exception) so
	      # Ctrl-C and exit still stop the script.
	      puts "failed to download #{image} - #{e.message}"
	    end
	  end
	end