ashfurrow/Readme.md

## Readme.md

      
    Raw
  

              Readme.md
            
          
    Wordpress-to-Middleman

So I'm moving my site from Squarespace to Middleman, a static website engine. Squarespace lets you export your content as a WordPress-compatible XML file. I wrote this script to generate Middleman blog entries for the published posts in that Squarespace-exported XML file.
What it Does

So what does it do? It finds all published text posts and reconfigures them for use with Middleman. All Squarespace-hosted images are downloaded to your computer and given unique names, and the img tags in the posts themselves are updated. I also fix a few other things, like Instagram and Vimeo embeds, and add some Bootstrap CSS classes to img tags. It's pretty customized to my needs – not a general-purpose script.
Instructions

Make sure the Nokogiri gem is installed. Download script.rb, make it executable with chmod a+x script.rb, and run it with the path to your exported XML file as its only argument.
Caution

This worked for me – that does not mean it will work for you. I am not a Ruby developer, so this code is probably really bad. I'm happy to answer questions on Twitter.

  
## script.rb
#!/usr/bin/ruby

require 'Nokogiri'
require 'net/http'
require 'securerandom'
require 'FileUtils'
require 'date'

# A published blog post taken from one <item> of a WordPress-style export,
# able to write itself out as a Middleman blog entry.
class Post
  attr_accessor :title
  attr_accessor :raw_content
  attr_accessor :pub_date
  attr_accessor :link

  # node - the export's <item> element. It must respond to xpath(path), and
  #        each result must respond to text (e.g. a Nokogiri XML node).
  def initialize(node)
    @title = node.xpath("title").text
    @raw_content = node.xpath("content:encoded").text
    @pub_date = node.xpath("pubDate").text
    @link = node.xpath("link").text
  end

  # Writes this post to blog/<yyyy-mm-dd>-<slug>.markdown, relative to the
  # current directory. Along the way, images whose src mentions
  # squarespace.com are downloaded under img/import<link>/ and their <img>
  # tags repointed, iframes are wrapped in responsive-embed divs, and the <p>
  # children of Instagram oembed divs are removed.
  #
  # Raises whatever DateTime.parse raises when pub_date is not a parseable date.
  def generate_markdown
    content_node = Nokogiri::HTML("<import>#{ @raw_content }</import>")

    localize_images(content_node)
    make_iframes_responsive(content_node)

    # Remove Instagram ickiness
    content_node.xpath("//div[contains(@class,\"instagram-oembed\")]/p").each(&:remove)

    # DateTime rather than Date: Date drops the time of day, which made the
    # %H:%M in the front matter always come out as 00:00.
    date = DateTime.parse(@pub_date)

    directory = "blog"
    FileUtils.mkdir_p directory
    filename = "#{ directory }/#{ date.strftime('%Y-%m-%d') }-#{ slug }.markdown"

    body = "#{ front_matter(date) }\n#{ content_node.xpath("//import").first }\n\n<!-- more -->\n\n"

    File.write(filename, body)

    puts "Wrote #{ filename }"
  end

  def to_s
    "#{ @title } published on #{ pub_date }"
  end

  private

  # Downloads every <img> whose src mentions squarespace.com and points the
  # tag at the local copy. Images that cannot be fetched are left untouched.
  def localize_images(content_node)
    images = content_node.xpath("//img").select { |image| image["src"] && image["src"].include?("squarespace.com") }

    images.each do |image|
      response = Net::HTTP.get_response(URI.parse(image["src"]))

      # Do not save an error page (or an unfollowed redirect) as an image.
      unless response.is_a?(Net::HTTPSuccess)
        puts "Warning – could not download (HTTP #{ response.code }): #{ image["src"] }"
        next
      end

      extension = image_extension(response["Content-Type"], image["src"])

      directory = "img/import#{ link }/"
      filename = "#{ directory }#{ SecureRandom.uuid.delete('-').upcase }.#{ extension }"

      FileUtils.mkdir_p directory
      # Binary write so the image bytes are never newline- or encoding-translated.
      File.binwrite(filename, response.body)

      puts "Wrote #{ filename } to disk"

      # Change the content_node's img children to point to their new files
      image["src"] = "/#{ filename }"
      image["class"] = "img-responsive"
    end
  end

  # Maps a Content-Type header value (possibly nil) to a file extension,
  # warning and falling back to "jpeg" when the type is not recognised.
  def image_extension(content_type, src)
    type = content_type.to_s

    if type.include?("image/jpg") || type.include?("image/jpeg")
      "jpg"
    elsif type.include?("image/gif")
      "gif"
    elsif type.include?("image/png")
      "png"
    else
      puts "Warning – unknown file (defaulting to jpeg): #{ src }"
      "jpeg"
    end
  end

  # Wraps every iframe in a responsive-embed div and tags the iframe itself.
  def make_iframes_responsive(content_node)
    iframes = content_node.xpath("//iframe")
    iframes.wrap("<div class='embed-responsive embed-responsive-16by9'></div>")
    iframes.each { |iframe| iframe["class"] = "embed-responsive-item" }
  end

  # File-name form of the title: spaces become dashes, the characters
  # ' " ? / : are dropped, and the result is lower-cased.
  def slug
    @title.gsub(' ', '-').gsub(/['"?\/:]/, '').downcase
  end

  # The front-matter block (ending in a newline). Double quotes and
  # backslashes in the title are escaped so the quoted value stays valid.
  def front_matter(date)
    escaped_title = @title.gsub(/["\\]/) { |char| "\\#{ char }" }
    "---\ntitle: \"#{ escaped_title }\"\ndate: #{ date.strftime('%Y-%m-%d %H:%M') }\n---\n"
  end
end

# Entry point: read the export file named on the command line and write a
# Middleman blog entry for each published post in it.
filename = ARGV.first
abort "Usage: ./script path_of_xml_file" unless filename
abort "File does not exist" unless File.exist?(filename)

# Block form closes the file even if parsing raises.
doc = File.open(filename) { |file| Nokogiri::XML(file) }

puts "Opened XML file at #{ filename }"

# Keep only items whose wp:post_type is "post" and whose wp:status is "publish".
post_nodes = doc.xpath("//item").select do |item|
  item.xpath("wp:post_type/text()").text == "post" &&
    item.xpath("wp:status/text()").text == "publish"
end

posts = post_nodes.map { |node| Post.new(node) }

# puts posts[0].generate_markdown
posts.each(&:generate_markdown)
	#!/usr/bin/ruby

	require 'Nokogiri'
	require 'net/http'
	require 'securerandom'
	require 'FileUtils'
	require 'date'

	# A published blog post taken from one <item> of a WordPress-style export,
	# able to write itself out as a Middleman blog entry.
	class Post
	  attr_accessor :title
	  attr_accessor :raw_content
	  attr_accessor :pub_date
	  attr_accessor :link

	  # node - the export's <item> element. It must respond to xpath(path), and
	  #        each result must respond to text (e.g. a Nokogiri XML node).
	  def initialize(node)
	    @title = node.xpath("title").text
	    @raw_content = node.xpath("content:encoded").text
	    @pub_date = node.xpath("pubDate").text
	    @link = node.xpath("link").text
	  end

	  # Writes this post to blog/<yyyy-mm-dd>-<slug>.markdown, relative to the
	  # current directory. Along the way, images whose src mentions
	  # squarespace.com are downloaded under img/import<link>/ and their <img>
	  # tags repointed, iframes are wrapped in responsive-embed divs, and the <p>
	  # children of Instagram oembed divs are removed.
	  #
	  # Raises whatever DateTime.parse raises when pub_date is not a parseable date.
	  def generate_markdown
	    content_node = Nokogiri::HTML("<import>#{ @raw_content }</import>")

	    localize_images(content_node)
	    make_iframes_responsive(content_node)

	    # Remove Instagram ickiness
	    content_node.xpath("//div[contains(@class,\"instagram-oembed\")]/p").each(&:remove)

	    # DateTime rather than Date: Date drops the time of day, which made the
	    # %H:%M in the front matter always come out as 00:00.
	    date = DateTime.parse(@pub_date)

	    directory = "blog"
	    FileUtils.mkdir_p directory
	    filename = "#{ directory }/#{ date.strftime('%Y-%m-%d') }-#{ slug }.markdown"

	    # Built as one string (no heredoc) so source indentation cannot leak
	    # into the generated file.
	    body = "#{ front_matter(date) }\n#{ content_node.xpath("//import").first }\n\n<!-- more -->\n\n"

	    File.write(filename, body)

	    puts "Wrote #{ filename }"
	  end

	  def to_s
	    "#{ @title } published on #{ pub_date }"
	  end

	  private

	  # Downloads every <img> whose src mentions squarespace.com and points the
	  # tag at the local copy. Images that cannot be fetched are left untouched.
	  def localize_images(content_node)
	    images = content_node.xpath("//img").select { |image| image["src"] && image["src"].include?("squarespace.com") }

	    images.each do |image|
	      response = Net::HTTP.get_response(URI.parse(image["src"]))

	      # Do not save an error page (or an unfollowed redirect) as an image.
	      unless response.is_a?(Net::HTTPSuccess)
	        puts "Warning – could not download (HTTP #{ response.code }): #{ image["src"] }"
	        next
	      end

	      extension = image_extension(response["Content-Type"], image["src"])

	      directory = "img/import#{ link }/"
	      filename = "#{ directory }#{ SecureRandom.uuid.delete('-').upcase }.#{ extension }"

	      FileUtils.mkdir_p directory
	      # Binary write so the image bytes are never newline- or encoding-translated.
	      File.binwrite(filename, response.body)

	      puts "Wrote #{ filename } to disk"

	      # Change the content_node's img children to point to their new files
	      image["src"] = "/#{ filename }"
	      image["class"] = "img-responsive"
	    end
	  end

	  # Maps a Content-Type header value (possibly nil) to a file extension,
	  # warning and falling back to "jpeg" when the type is not recognised.
	  def image_extension(content_type, src)
	    type = content_type.to_s

	    if type.include?("image/jpg") || type.include?("image/jpeg")
	      "jpg"
	    elsif type.include?("image/gif")
	      "gif"
	    elsif type.include?("image/png")
	      "png"
	    else
	      puts "Warning – unknown file (defaulting to jpeg): #{ src }"
	      "jpeg"
	    end
	  end

	  # Wraps every iframe in a responsive-embed div and tags the iframe itself.
	  def make_iframes_responsive(content_node)
	    iframes = content_node.xpath("//iframe")
	    iframes.wrap("<div class='embed-responsive embed-responsive-16by9'></div>")
	    iframes.each { |iframe| iframe["class"] = "embed-responsive-item" }
	  end

	  # File-name form of the title: spaces become dashes, the characters
	  # ' " ? / : are dropped, and the result is lower-cased.
	  def slug
	    @title.gsub(' ', '-').gsub(/['"?\/:]/, '').downcase
	  end

	  # The front-matter block (ending in a newline). Double quotes and
	  # backslashes in the title are escaped so the quoted value stays valid.
	  def front_matter(date)
	    escaped_title = @title.gsub(/["\\]/) { |char| "\\#{ char }" }
	    "---\ntitle: \"#{ escaped_title }\"\ndate: #{ date.strftime('%Y-%m-%d %H:%M') }\n---\n"
	  end
	end

	# Entry point: read the export file named on the command line and write a
	# Middleman blog entry for each published post in it.
	filename = ARGV.first
	abort "Usage: ./script path_of_xml_file" unless filename
	abort "File does not exist" unless File.exist?(filename)

	# Block form closes the file even if parsing raises.
	doc = File.open(filename) { |file| Nokogiri::XML(file) }

	puts "Opened XML file at #{ filename }"

	# Keep only items whose wp:post_type is "post" and whose wp:status is "publish".
	post_nodes = doc.xpath("//item").select do |item|
	  item.xpath("wp:post_type/text()").text == "post" &&
	    item.xpath("wp:status/text()").text == "publish"
	end

	posts = post_nodes.map { |node| Post.new(node) }

	# puts posts[0].generate_markdown
	posts.each(&:generate_markdown)