Create a gist now

Instantly share code, notes, and snippets.

@mttmccb /import.rb forked from spiffytech/import.rb
Last active Dec 14, 2016

What would you like to do?
Imports a Squarespace dump into Jekyll
# coding: utf-8
require 'rubygems'
require 'nokogiri'
require 'fileutils'
require 'safe_yaml'
require 'time'
require 'uri'
require 'open-uri'
module JekyllImport
# This importer takes a wordpress.xml file, which can be exported from your
# wordpress.com blog (/wp-admin/export.php). Each <item> in the dump becomes
# a Jekyll file under "_<type>s/" (e.g. _posts/), and Squarespace-hosted
# images are downloaded locally with their URLs rewritten to our copies.
#
# NOTE(review): the node names used below (wppost_name, contentencoded, ...)
# assume the XML namespace colons were stripped from the dump beforehand
# (e.g. wp:post_name -> wppost_name) -- see the gist author's comment.
module WordpressDotCom
# Directory (relative to cwd) where downloaded images are stored.
@image_folder = 'squarespace_images'
class << self
  # Read-only accessor for the image download directory.
  attr_reader :image_folder
end
# Download the URL +src+ to the local path +dest+. Skips the download when
# +dest+ already exists (speed-up for repeated imports).
def self.download_image(src, dest)
  return if ::File.exist? dest
  File.open(dest, "wb") do |saved_file|
    # URI.open is provided by open-uri. Kernel#open lost its URL support in
    # Ruby 3.0, so the explicit receiver is required on modern Rubies.
    URI.open(src, "rb") do |read_file|
      saved_file.write(read_file.read)
    end
  end
end
# Import every <item> from the XML dump named by filename[:source], writing
# one file per item with a YAML front-matter header, then print a per-type
# summary of how many items were imported.
def self.process(filename = {:source => "_wordpress.xml"})
  Dir.mkdir @image_folder unless ::File.exist? @image_folder
  import_count = Hash.new(0)
  dump = Nokogiri::XML(File.read(filename[:source]))
  (dump/:channel/:item).each do |item|
    title = item.at(:title).inner_text.strip
    permalink_title = item.at('wppost_name').inner_text.gsub("/", "-")
    # Fallback to "prettified" title if post_name is empty (can happen)
    permalink_title = sluggify(title) if permalink_title == ""
    # Use the post's own date when present and parseable; otherwise "now".
    date =
      begin
        item.at('wppost_date') ? Time.parse(item.at('wppost_date').inner_text) : Time.now
      rescue
        Time.now
      end
    status = item.at('wpstatus').inner_text
    # NOTE(review): computed but never written into the front matter below --
    # this looks like an oversight inherited from the upstream importer.
    published = (status == "publish")
    type = item.at('wppost_type').inner_text
    categories = item.search('category[@domain="category"]').map { |c| c.inner_text }.reject { |c| c == 'Uncategorized' }.uniq
    tags = item.search('category[@domain="post_tag"]').map { |t| t.inner_text }.uniq
    # NOTE(review): metas and excerpt are collected but currently unused.
    metas = Hash.new
    item.search("wppostmeta").each do |meta|
      metas[meta.at('wpmeta_key').inner_text] = meta.at('wpmeta_value').inner_text
    end
    excerpt = item.at('excerptencoded').inner_text
    # Identify Squarespace-hosted images, download them, and update the
    # URLs to point to our copies
    body = item.at('contentencoded').inner_text
    body = body.gsub(/\[\/?caption[^]]*\]/, '') # Remove caption blocks which don't render properly
    html = Nokogiri::HTML(body)
    html.css('img').each do |element|
      puts element
      src = element['src']
      begin
        u = URI src
      rescue URI::InvalidURIError
        next # leave malformed image URLs untouched
      end
      # u.host is nil for relative URLs -- guard before matching the domain,
      # otherwise end_with? raises NoMethodError on nil.
      next unless u.host && u.host.end_with?('squarespace.com')
      # Flatten the URL path into a single local file name (fragment, when
      # present, disambiguates otherwise-identical paths).
      image_name = u.path.sub(/^\//, '').gsub('/', '_') + '_' + (u.fragment || '')
      dest = ::File.join(@image_folder, image_name)
      download_image(src, dest)
      element['src'] = '/' + dest
    end
    body = html.to_s
    name = "#{date.strftime('%Y-%m-%d')}-#{permalink_title}.html"
    header = {
      'title' => title,
      'categories' => categories,
      'tags' => tags
    }
    begin
      FileUtils.mkdir_p "_#{type}s"
      File.open("_#{type}s/#{name}", "w") do |f|
        f.puts header.to_yaml
        f.puts '---'
        f.puts body
      end
    rescue => e
      # Report and skip items that fail to write; keep importing the rest.
      puts "Couldn't import post!"
      puts "Title: #{title}"
      puts "Name/Slug: #{name}\n"
      puts "Error: #{e.message}"
      next
    end
    import_count[type] += 1
  end
  import_count.each do |key, value|
    puts "Imported #{value} #{key}s"
  end
end
# Turn a human-readable title into a URL slug: every run of
# non-alphanumeric characters becomes a single "-", then downcase.
def self.sluggify(title)
  title.gsub(/[^[:alnum:]]+/, '-').downcase
end
end
end
# Run the import with the default source file (_wordpress.xml in cwd).
JekyllImport::WordpressDotCom.process
#!/usr/bin/env ruby
# --- Second, standalone script (note the shebang): converts the .html posts
# produced by the importer above into Markdown, using the third-party
# html2markdown gem, and renames them from .html to .md in place. ---
require 'html2markdown'
# Matches Jekyll post filenames of the form YYYY-MM-DD-title.html, with
# named captures for each date part and the title slug.
# NOTE(review): the "." before "html" is unescaped and the pattern is
# unanchored, so e.g. "1-2-3-xhtml5" would also match -- confirm the input
# directory only contains importer output before widening use.
POST_REGEX = %r{(?<year>[0-9]+)-(?<month>[0-9]+)-(?<day>[0-9]+)-(?<title>.*).html}
files = Dir.glob('*.html').select{ |f| f.match POST_REGEX }
files.each do |post|
data = post.match(POST_REGEX)
# HTMLPage comes from html2markdown; #markdown returns the converted text.
p = HTMLPage.new(contents: File.read(post))
# Overwrite the .html file with Markdown, then rename it to .md.
File.open(post, 'w') { |f| f.puts p.markdown }
File.rename(post, "#{data[:year]}-#{data[:month]}-#{data[:day]}-#{data[:title]}.md")
end
Owner

mttmccb commented Dec 14, 2016

Not sure why, but I couldn't read nodes with a `:` in their names (e.g. `wp:post_name`), so I had to do a bit of find-and-replace on the dump first. Also, hpricot didn't seem to work.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment