Skip to content

Instantly share code, notes, and snippets.

@tomelm
Last active August 29, 2015 14:24
Show Gist options
  • Save tomelm/6f309818553367c28502 to your computer and use it in GitHub Desktop.
Save tomelm/6f309818553367c28502 to your computer and use it in GitHub Desktop.
require 'date'
require 'fileutils'
require 'nokogiri'
require 'rest-client'
require 'reverse_markdown'
# Match [caption <stuff>]...[/caption] tags
# example: http://rubular.com/r/r2FH3QSOpL
#
# Each match is either an opening tag or a closing tag. The lookahead makes the
# greedy opening-tag branch end on a `]` that still has a `[` after it, so it
# stops before the closing [/caption] instead of swallowing the whole line.
CAPTION_REGEX = /\[caption.*\](?=.*\[)|\[\/caption\]/

# Match the entirety of an img html tag
# example: http://rubular.com/r/xU3ZUF1vvY
IMG_TAG_REGEX = /(<img.*?>)/

# Match a double-quoted src="..." attribute, including the attribute name
IMG_SRC_REGEX = /src=".*?"/

# Capture the value of a double-quoted src attribute of an img tag in group 1.
# The `.*` before `src` is greedy, so the last `src="` on the line is the one
# that gets captured.
IMG_SRC_GROUP_REGEX = /<img.*src="(.*?)".*\/?>/

# Regexes for gist shortcode detection and attribute extraction
GIST_REGEX = /\[gist.*\]/
GIST_ID_REGEX = /\[gist id="(.*?)".*\]/
# Group 1 is the file attribute. The character classes keep the match inside a
# single [gist ...] shortcode, so a later `"]` on the same line cannot be pulled
# into the captured file name.
GIST_FILE_REGEX = /\[gist [^\]]* file="([^"]*)"[^\]]*\]/

# Base url for all the data; frozen so the shared constant cannot be mutated
BLOG_BASE_URL = 'http://engineeringblog.yelp.com'.freeze
# Reads an XML file from disk and hands its contents to Nokogiri
#
# path - location of the XML file to load
#
# Returns the Nokogiri XML object built from the file's contents
def open_xml_file(path)
  raw_xml = File.read(path)
  Nokogiri::XML(raw_xml)
end
# Builds a lookup of author logins to display names from a WordPress XML file
#
# xml - a parsed, Nokogiri XML object
#
# Examples
#
# extract_authors(xml)
# # => { 'darwin' => 'Darwin S., Software engineer' }
#
# Returns a hash mapping each author's login to their display name
def extract_authors(xml)
  xml.xpath('//wp:author').each_with_object({}) do |node, lookup|
    login = node.xpath('wp:author_login').first.text
    lookup[login] = node.xpath('wp:author_display_name').first.text
  end
end
# Wraps one <item> node of a WordPress export and renders it as a Jekyll post:
# a file name, YAML front matter, and a converted body.
#
# The source URLs of images found in the body are collected in #images while
# #content runs, so #images is only populated after #content has been called.
class Post
  attr_accessor :images

  # xml     - the XML node for a single post; only #xpath is called on it
  # authors - a hash of author login => display name (see extract_authors)
  def initialize(xml, authors={})
    @xml = xml
    @author = author(authors)
    @images = []
  end

  # Returns the text of the post's title element
  def title
    @title ||= @xml.xpath('title').text
  end

  # authors - a hash of author login => display name
  #
  # Returns the display name for the post's dc:creator login, or the login
  # itself when it has no entry in authors. The value is resolved once (from
  # #initialize) and reused afterwards.
  def author(authors={})
    @author ||= begin
      login = @xml.xpath('dc:creator').text
      authors[login] || login
    end
  end

  # Returns the pubDate of the post parsed into a DateTime
  def date
    @date ||= DateTime.parse(@xml.xpath('pubDate').text)
  end

  # Returns the text of the post's wp:post_name element
  def post_name
    @post_name ||= @xml.xpath('wp:post_name').text
  end

  # Returns the file name for the post: "<yyyy-mm-dd>-<post_name>.markdown"
  def file_name
    "#{date.strftime('%Y-%m-%d')}-#{post_name}.markdown"
  end

  # Returns the YAML front matter block for the post.
  #
  # The gsub strips leading whitespace from every line, which also swallows
  # blank lines, so the result ends directly after the closing `---` line.
  def front_matter
    # The title is emitted as a double-quoted YAML scalar, so backslashes and
    # double quotes inside it must be escaped to keep the front matter valid.
    escaped_title = title.gsub(/["\\]/) { |char| "\\#{char}" }
    <<-eos.gsub(/^\s+/, '')
      ---
      layout: post
      title: "#{escaped_title}"
      author: #{author}
      date: #{date}
      published: true
      ---
    eos
  end

  # Converts the post body line by line and joins the results without a
  # separator.
  #
  # The result is memoized: converting a line appends to #images, so running
  # the conversion again would record every image a second time.
  #
  # Returns the converted body as a String
  def content
    @content ||= @xml.xpath('content:encoded').text.split("\n").map do |line|
      ReverseMarkdown.convert(clean_line(line))
    end.join
  end

  private

  # Prepares a single line of the body for ReverseMarkdown.
  #
  # Records the image URL (if any), points the img src at the local image
  # path, and replaces caption/gist shortcodes with their Jekyll equivalents.
  # A line containing a gist shortcode becomes the gist tag even when it also
  # contains a caption. Any other line is wrapped in <p> tags.
  #
  # line - one line of the post body; it is modified in place when an image
  #        src is rewritten
  #
  # Returns the cleaned line as a String
  def clean_line(line)
    if line.index(IMG_TAG_REGEX)
      image_url = line[IMG_SRC_GROUP_REGEX, 1]
      # An img tag without a double-quoted src yields no URL; leave the line
      # alone instead of failing on File.basename(nil).
      if image_url
        @images << image_url
        image_path = "/images/posts/#{post_name}/#{File.basename(image_url)}"
        # Keep the attribute quoted so a following "/>" is not read as part of
        # the value; the block form keeps backslashes in the path literal.
        line.gsub!(IMG_SRC_REGEX) { %(src="#{image_path}") }
      end
    end

    cleaned_line =
      if line =~ GIST_REGEX
        extract_gist(line)
      elsif line =~ CAPTION_REGEX
        extract_caption(line, image_path)
      else
        "<p>#{line}</p>"
      end

    cleaned_line.gsub('’', "'") # fix unicode apostrophe issues
  end

  # Builds the Jekyll include tag for a captioned image.
  #
  # line      - the line containing the [caption] shortcode
  # image_url - the local image path, or nil when the line had no image
  #
  # Returns the include tag as a String
  def extract_caption(line, image_url)
    # TODO figure out how I want to handle images and captions later
    # ref: http://stackoverflow.com/questions/19331362/using-an-image-caption-in-markdown-jekyll
    caption = Nokogiri::HTML(line).text
                                  .strip
                                  .sub(CAPTION_REGEX, '') # remove opening caption
                                  .sub(CAPTION_REGEX, '') # remove closing caption
    "{% include post/image.html image=\"#{image_url}\" caption=\"#{caption}\" %}"
  end

  # Builds the gist tag from the id and file attributes of a [gist] shortcode.
  # An attribute that is not found is rendered as an empty string.
  def extract_gist(line)
    "{{ gist #{line[GIST_ID_REGEX, 1]} #{line[GIST_FILE_REGEX, 1]} }}"
  end
end
# Script entry point: converts every <item> in ./blog.xml into a Jekyll post
# under _posts/ and downloads the images each post references into
# images/posts/<post_name>/.
xml = open_xml_file('./blog.xml')
authors = extract_authors(xml)
raw_posts = xml.xpath('//item')

# mkdir_p creates missing parent directories and does nothing when the
# directory already exists, so the script can be re-run over earlier output.
FileUtils.mkdir_p('_posts')
FileUtils.mkdir_p('images/posts')

puts 'Converting posts'
posts = raw_posts.map { |item| Post.new(item, authors) }

puts 'Processing and writing posts, images'
posts.each do |post|
  puts post.file_name
  File.write("_posts/#{post.file_name}", post.front_matter + post.content)

  image_dir = "images/posts/#{post.post_name}"
  FileUtils.mkdir_p(image_dir)

  post.images.each do |image|
    next if image.nil?

    # Only URLs without an http(s) prefix are resolved against the blog; a
    # path that merely contains "http" somewhere still needs the base url.
    url = image.start_with?('http') ? image : BLOG_BASE_URL + image
    puts "|--> #{url}"
    begin
      # Download first so a failed request does not leave an empty file behind.
      response = RestClient.get(url)
      File.open("#{image_dir}/#{File.basename(url)}", 'wb') do |file|
        file.write(response)
      end
    rescue StandardError => e
      # Best effort: report the failure and carry on with the next image.
      # StandardError keeps Ctrl-C and exit working, unlike rescuing Exception.
      puts "failed to download #{url} - #{e.message}"
    end
  end
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment