carlbrown/squarespace_to_octopress.rb

## squarespace_to_octopress.rb
#!/bin/env ruby

require 'rubygems'
require 'fileutils'
require 'pathname'
require 'pp'
require 'date'
require 'uri'
require 'html2markdown'

# usage: ruby squarespace_to_octopress.rb <squarespace_export.txt> <octopress _posts directory> [crawled site root directory]
# squarespace_export.txt is a file from JournalPage -> Configuration -> Export Data in Squarespace.
# octopress _posts directory is your octopress/source/_posts directory
# crawled site root directory is an optional argument that specifies a directory on local disk that is a crawled
#                             mirror of your site - used for extracting tags

# The script accepts exactly two or three command-line arguments (export file,
# _posts directory, optional crawled-site root); anything else prints the usage
# line to stderr and exits with status 2.
unless [2, 3].include?(ARGV.length)
  $stderr.puts "Usage: #{$0} <squarespace_export.txt> <octopress _posts directory> [ crawled site root directory ]"
  exit 2
end


# Post records collected while reading the export, in file order.
@posts = []

# Begin a new, blank post record and rewind the parser flags.
#
# Replaces @current with a freshly built hash (so a record that was already
# pushed onto @posts is not shared with the next one) and marks the parser as
# being inside a header section, outside any body or comment section.
def reset_state
  # Section flags: every entry opens with its header block.
  @inComments = false
  @inBody = false
  @inHeader = true

  @current = {
    :date => "",
    :title => "",
    :body => "",
    :markdown => "",
    :author => "",
    :allow_comments => false,
    :published => false,
    :categories => []
  }
end

# Parse the Squarespace export named by ARGV[0] into @posts.
#
# The reader is a small line-driven state machine built on the flags that
# reset_state initialises:
#   "--------"            ends an entry (kept only when it has a title)
#   "-----"               ends the current section (header, body or comment)
#   "BODY:" / "COMMENT:"  open the matching section
# Header lines are "KEY: value" pairs. Body lines are accumulated verbatim and
# converted to markdown through HTMLPage when the body section closes.
# Comment sections are not migrated; each of their lines only emits a warning.
File.open(ARGV[0]) do |f|
  reset_state

  f.each_line do |line|
    stripped = line.strip

    if stripped == "--------"
      @posts.push @current if @current[:title] != ""
      reset_state
    elsif @inBody
      if stripped == "-----"
        # Named "page" rather than "p" so Kernel#p is not shadowed.
        page = HTMLPage.new :contents => @current[:body]
        @current[:markdown] = page.markdown
        @inBody = false
      else
        @current[:body] += line
      end
    elsif @inHeader
      if stripped == "-----"
        @inHeader = false
      else
        # Keys are matched against the raw line; values come from the stripped one.
        case line
        when /^AUTHOR:/
          @current[:author] = stripped.sub(/^AUTHOR: /, '')
        when /^TITLE:/
          @current[:title] = stripped.sub(/^TITLE: /, '')
        when /^DATE:/
          # DateTime, not Date: Date.strptime accepts this format but throws the
          # time of day away, so a later '%H:%M' would always print 00:00.
          # DateTime is still a Date subclass, so is_a?(Date) checks and
          # "date - 1" (previous day) keep working on the stored value.
          @current[:date] = DateTime.strptime(stripped.sub(/^DATE: /, ''), '%m/%d/%Y %I:%M:%S %p')
        when /^CATEGORY:/
          @current[:categories].push stripped.sub(/^CATEGORY: /, '')
        when /^ALLOW COMMENTS:/
          @current[:allow_comments] = (stripped.sub(/^ALLOW COMMENTS: /, '') == "1")
        when /^STATUS:/
          @current[:published] = (stripped.sub(/^STATUS: /, '') == "Publish")
        end
      end
    elsif @inComments
      if stripped == "-----"
        @inComments = false
      else
        $stderr.puts "Warning: Skipping unsupported comment"
      end
    elsif stripped == "BODY:"
      @inBody = true
    elsif stripped == "COMMENT:"
      @inComments = true
    end
  end

  # The last entry is only collected here when no "--------" followed it.
  @posts.push @current if @current[:title] != ""
end

# Write one Octopress post file into ARGV[1] for every entry in @posts.
#
# Each file is named "YYYY-MM-DD-<slug>.markdown" and contains a YAML
# front-matter header (layout, title, date, permalink, comments, optional
# categories and tags) followed by the entry's markdown. When a crawled-site
# directory is given as ARGV[2], the crawled HTML page for the post is looked up
# there and its "tag-element" spans become the post's tags.
@posts.each_with_index { |post, index|
  # Slug, permalink and file name are all derived from the date, so an entry
  # without a parsed date is reported and skipped. Previously this check ran
  # only after strftime had already been called on the value, so such an entry
  # raised NoMethodError instead of producing the warning.
  unless post[:date].is_a?(Date)
    $stderr.puts "Bad date: #{post[:date].inspect} on post #{index}: #{post[:title]}"
    next
  end

  # Rebuild the URL slug in the shape of the sample URLs below: lower-case,
  # only [a-z0-9-], blanks turned into single dashes, at most 60 characters,
  # no trailing dash.
  # http://www.escortmissions.com/blog/2012/7/7/new-work-in-progress-million-words-multiplayer-crossword-gam.html
  # http://www.escortmissions.com/blog/2012/6/10/my-top-5-factors-for-ios-contracting-success.html
  sqsp_title = post[:title].downcase.gsub(/[^a-z0-9\- ]/, '').tr(' ', '-').squeeze('-')[0..59].sub(/-$/, '')
  sqsp_slug = "#{post[:date].strftime('%Y/%-m/%-d')}/#{sqsp_title}.html"

  # Path of the crawled page for this post, or nil when no crawl directory was
  # given or no matching page exists.
  crawled_file_name = nil
  if ARGV.length == 3
    crawled_file_name = "#{ARGV[2]}/blog/#{sqsp_slug}"
    unless File.exist?(crawled_file_name)
      ## try the previous day - I have a tendency to post around midnight
      previous_day = post[:date] - 1
      previous_slug = "#{previous_day.strftime('%Y/%-m/%-d')}/#{sqsp_title}.html"
      previous_file_name = "#{ARGV[2]}/blog/#{previous_slug}"
      if File.exist?(previous_file_name)
        # The permalink follows whichever dated path the crawl actually has.
        sqsp_slug = previous_slug
        crawled_file_name = previous_file_name
      end
    end
    unless File.exist?(crawled_file_name)
      $stderr.puts "Warning, Couldn't find file #{crawled_file_name}"
      crawled_file_name = nil
    end
  end

  filename = "#{post[:date].strftime('%Y-%m-%d-')}#{sqsp_title}.markdown"

  File.open("#{ARGV[1]}/#{filename}", 'w') { |file|
    file.puts("---")
    file.puts("layout: post")
    # ':' in the title is written as the HTML entity "&#58;" -- presumably so it
    # cannot be mistaken for a YAML key separator.
    file.puts("title: #{post[:title].gsub(':', '&#58;')}")
    file.puts("date: #{post[:date].strftime('%Y-%m-%d %H:%M')}")
    file.puts("permalink: /blog/#{sqsp_slug}")
    file.puts("comments: #{post[:allow_comments]}")
    file.puts("categories: #{post[:categories]}") unless post[:categories].empty?
    if crawled_file_name
      # Block-form File.open closes the handle once parsing is done, and, unlike
      # Kernel#open, never treats the path as anything but a file name. The IO
      # (not a String) is still what gets handed to Nokogiri.
      # NOTE(review): this file never requires nokogiri itself; presumably
      # html2markdown loads it -- confirm.
      crawled_page = File.open(crawled_file_name) { |io| Nokogiri::HTML(io) }
      tags = crawled_page.css("span[class='tag-element']").map { |tag| tag.text.strip }
      file.puts("tags: #{tags}") unless tags.empty?
    end
    file.puts("---")
    file.puts(post[:markdown])
  }
}
	#!/bin/env ruby

	require 'rubygems'
	require 'fileutils'
	require 'pathname'
	require 'pp'
	require 'date'
	require 'uri'
	require 'html2markdown'

	# usage: ruby squarespace_to_octopress.rb <squarespace_export.txt> <octopress _posts directory> [crawled site root directory]
	# squarespace_export.txt is a file from JournalPage -> Configuration -> Export Data in Squarespace.
	# octopress _posts directory is your octopress/source/_posts directory
	# crawled site root directory is an optional argument that specifies a directory on local disk that is a crawled
	# mirror of your site - used for extracting tags

	if (ARGV.length != 2 && ARGV.length != 3)
	$stderr.puts "Usage: #{$0} <squarespace_export.txt> <octopress _posts directory> [ crawled site root directory ]"
	exit 2
	end



	@posts = []

	def reset_state
	@current = {:date => "", :title => "", :body => "", :markdown => "", :author => "", :allow_comments => false, :published => false, :categories => []}

	@inBody = false
	@inHeader = true #The file starts with the first header
	@inComments = false
	end

	File.open(ARGV[0]) do |f|
	reset_state

	f.each_line do |line|
	if line.strip == "--------"
	@posts.push @current if (@current[:title] != "")
	reset_state
	elsif @inBody
	if line.strip == "-----"
	p = HTMLPage.new :contents => @current[:body]
	@current[:markdown]=p.markdown
	@inBody = false
	else
	@current[:body] += line
	end
	elsif @inHeader
	if line.strip == "-----"
	@inHeader = false
	elsif (line =~ /^AUTHOR:/)
	@current[:author] = line.strip.gsub(/^AUTHOR: /,'')
	elsif (line =~ /^TITLE:/)
	@current[:title] = line.strip.gsub(/^TITLE: /,'')
	elsif (line =~ /^DATE:/)
	@current[:date] = Date.strptime line.strip.gsub(/^DATE: /,'') , '%m/%d/%Y %I:%M:%S %p'
	elsif (line =~ /^CATEGORY:/)
	@current[:categories].push line.strip.gsub(/^CATEGORY: /,'')
	elsif (line =~ /^ALLOW COMMENTS:/)
	@current[:allow_comments] = (line.strip.gsub(/^ALLOW COMMENTS: /,'') == "1")
	elsif (line =~ /^STATUS:/)
	@current[:published] = (line.strip.gsub(/^STATUS: /,'') == "Publish")
	elsif line.strip == "-----"
	@inHeader = false
	end
	elsif @inComments
	if line.strip == "-----"
	@inComments = false
	else
	$stderr.puts "Warning: Skipping unsupported comment"
	end
	else
	if line.strip == "BODY:"
	@inBody = true
	elsif line.strip == "COMMENT:"
	@inComments = true
	end
	end
	end
	@posts.push @current if (@current[:title] != "")
	end

	count=0

	@posts.each { |p|
	# .gsub(':','%3a')
	# http://www.escortmissions.com/blog/2012/7/7/new-work-in-progress-million-words-multiplayer-crossword-gam.html
	# http://www.escortmissions.com/blog/2012/6/10/my-top-5-factors-for-ios-contracting-success.html
	sqsp_title=p[:title].downcase.gsub(/[^a-z0-9\- ]/,'').gsub(/ /,'-').gsub(/--*/,'-')[ 0 .. 59 ].sub(/-$/,'')
	sqsp_slug="#{p[:date].strftime('%Y/%-m/%-d')}/#{sqsp_title}.html"
	if (ARGV.length == 3)
	crawled_file_name = "#{ARGV[2]}/blog/#{sqsp_slug}"
	pn = Pathname.new(crawled_file_name)
	if (!pn.exist?)
	## try the previous day - I have a tendency to post around midnight
	previous_day = (p[:date]-1)
	previous_day_crawled_file_name = "#{ARGV[2]}/blog/#{previous_day.strftime('%Y/%-m/%-d')}/#{sqsp_title}.html"
	pn_prev = Pathname.new(previous_day_crawled_file_name)
	if (pn_prev.exist?)
	sqsp_slug="#{previous_day.strftime('%Y/%-m/%-d')}/#{sqsp_title}.html"
	pn=pn_prev
	crawled_file_name=previous_day_crawled_file_name
	end
	end
	if (!pn.exist?)
	$stderr.puts "Warning, Couldn't find file #{crawled_file_name}"
	end
	end
	filename = sqsp_title.sub(/$/,'.markdown').sub(/^/,p[:date].strftime('%Y-%m-%d-'))

	File.open("#{ARGV[1]}/#{filename}", 'w') { |file|
	file.puts("---")
	file.puts("layout: post")
	file.puts("title: #{p[:title].gsub(':','&#58;')}")
	if (p[:date].is_a?(Date))
	file.puts("date: #{p[:date].strftime('%Y-%m-%d %H:%M')}")
	else
	$stderr.puts "Bad date: #{p[:date]} on post #{count}:#{filename}"
	end
	file.puts("permalink: /blog/#{sqsp_slug}")
	file.puts("comments: #{p[:allow_comments]}")
	if (p[:categories].length > 0)
	file.puts("categories: #{p[:categories]}")
	end
	if (crawled_file_name)
	if (pn.exist?)
	crawled_page = Nokogiri::HTML(open(crawled_file_name))
	tags=[]
	crawled_page.css("span[class='tag-element']").each {|tag|
	tags.push tag.text.strip
	}
	if (tags.length>0)
	file.puts("tags: #{tags}")
	end
	end
	end
	file.puts("---")
	file.puts(p[:markdown])
	}
	count += 1;
	}